import pandas as pd
from pytrends.request import TrendReq

def ggl_trends(grouped, keyword):
    """Merge monthly-averaged Google Trends interest for `keyword` into `grouped`."""
    pytrends = TrendReq(hl='en-US', tz=360)
    kw_list = [keyword]
    pytrends.build_payload(kw_list, cat=0, timeframe='all', geo='US', gprop='')
    trends = pytrends.interest_over_time()
    if trends.empty:
        return pd.DataFrame()
    # Resample to monthly means and rename the keyword column.
    # ('M' is month-end; on pandas >= 2.2 use 'ME' instead.)
    grouped_ggl_trends = trends.groupby(pd.Grouper(freq='M')).mean().rename(
        columns={keyword: 'Google Trends'})
    return grouped.merge(grouped_ggl_trends, left_index=True,
                         right_index=True, how='inner')
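# Hypothetical usage sketch (not from the original source): `grouped` is assumed
# to be any DataFrame with a monthly DatetimeIndex, e.g. monthly mean prices
# loaded from an illustrative CSV. The path and column names are assumptions.
prices = pd.read_csv('btc_prices.csv', parse_dates=['date'], index_col='date')
monthly = prices.groupby(pd.Grouper(freq='M')).mean()
merged = ggl_trends(monthly, 'bitcoin')
print(merged.head())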
class GoogleTrendStatsEvaluator(StatsSocialEvaluator):
    def __init__(self):
        super().__init__()
        self.pytrends = None
        self.is_threaded = False

    # Uses the pytrends lib (https://github.com/GeneralMills/pytrends)
    # https://github.com/GeneralMills/pytrends/blob/master/examples/example.py
    def get_data(self):
        self.pytrends = TrendReq(hl='en-US', tz=0)
        # self.pytrends.GENERAL_URL = "https://trends.google.com/trends/explore"
        key_words = [self.symbol]
        try:
            # Only the 1- and 3-month timeframes appear to work here.
            time_frame = "today " + str(self.social_config[STATS_EVALUATOR_HISTORY_TIME]) + "-m"
            # Note: the request-per-hour limit appears to be quite low.
            self.pytrends.build_payload(kw_list=key_words, cat=0,
                                        timeframe=time_frame, geo='', gprop='')
        except ResponseError as e:
            self.logger.warning(str(e))

    def eval_impl(self):
        interest_over_time_df = self.pytrends.interest_over_time()
        # Analyse recent trend changes (Bollinger-band style statistics)
        self.eval_note = AdvancedManager.get_class(self.config, StatisticAnalysis).analyse_recent_trend_changes(
            interest_over_time_df[self.symbol], numpy.sqrt)

    def run(self):
        pass

    # Cap the configured history so it never exceeds the supported maximum
    def load_config(self):
        super(GoogleTrendStatsEvaluator, self).load_config()
        if self.social_config[STATS_EVALUATOR_HISTORY_TIME] > STATS_EVALUATOR_MAX_HISTORY_TIME:
            self.social_config[STATS_EVALUATOR_HISTORY_TIME] = STATS_EVALUATOR_MAX_HISTORY_TIME

    def set_default_config(self):
        self.social_config = {
            CONFIG_REFRESH_RATE: 3600,
            STATS_EVALUATOR_HISTORY_TIME: 3
        }
def scrape():
    cur_dir = os.getcwd()
    if request.method == 'POST':
        MONGODB_HOST = 'localhost'
        MONGODB_PORT = 27017
        DBS_NAME = 'donorschoose'
        COLLECTION_NAME = 'projects'
        FIELDS = {}
        connection = MongoClient(MONGODB_HOST, MONGODB_PORT)
        collection = connection[DBS_NAME][COLLECTION_NAME]
        projects = collection.find(projection=FIELDS, limit=5000)

        # Read in the classification model
        model = pickle.load(open(cur_dir + "/fakenews_model.dat", "rb"))
        # Read in the URL input by the user
        entered_url = request.form['text']
        # Create an article object
        article = newspaper.Article(url=entered_url)
        article.download()
        article.parse()
        article.nlp()
        # Extract the article text
        text = article.text
        d = {'text': [text]}
        text_df = pd.DataFrame(data=d)
        externaltestinput = text_df

        # Begin classification of the article:
        # clean the data and extract features
        externaltest = fe.clean_text_add_features(externaltestinput)
        # Vectorize the words
        externaltest = fe.word_vectors(externaltest)
        # Structure the data
        externaldata = externaltest[[
            'specialchar', 'wordcount', 'avewordlength', 'firstpersonwordcount',
            'uniquewords', 'capitalizedwords', 'vector'
        ]]
        externaldata2 = pd.concat(
            [externaldata['vector'].apply(pd.Series), externaldata], axis=1)
        externaldata2.drop('vector', axis=1, inplace=True)
        externaldata2.dropna(inplace=True)
        # .as_matrix() was removed in pandas 1.0; use .to_numpy() instead
        externaldata_predictor = externaldata2.to_numpy()
        SC = externaldata2['specialchar'][0]
        CW = externaldata2['capitalizedwords'][0]
        AWL = externaldata2['avewordlength'][0]
        awl_data = [100 * np.abs(1 - float(5 / AWL)), 100 * np.abs(float(5 / AWL))]
        cw_data = [100 * float(CW), 1000 * (1 - float(CW))]
        sc_data = [(1000 * np.abs(SC)), 100 - (1000 * np.abs(SC))]
        # Run the model and store the result
        xgb_externaltest_output = model.predict(externaldata_predictor)
        if xgb_externaltest_output:
            classification = "FAKE"
        else:
            classification = "REAL"

        # Get related words: the five most common (proper) nouns in the article
        tagged_art = pos_tag(article.text.split())
        all_nouns = [word for word, pos in tagged_art if pos in ('NN', 'NNP')]
        count = Counter(all_nouns)
        most_cmn = count.most_common()[0:5]
        noun_list = [str(most_cmn[i][0]) for i in range(5)]
        all_combos = list(combinations(noun_list, 2))
        places = GeoText(article.text)
        all_cities = list(set(places.cities))
        pytrend = TrendReq()
        # tf = 'now 7-d'
        tf = 'today 1-m'

        # Check whether any noun pair actually forms a city name
        prev_max = 0
        occur = dict(most_cmn)
        for item in all_combos:
            for city in all_cities:
                c1 = item[0] + " " + item[1]
                c2 = item[1] + " " + item[0]
                if c1 in city or c2 in city:
                    # Remove both nouns, add back the city plus the
                    # higher-occurrence noun; if tied, keep only the city
                    noun_list.remove(item[0])
                    noun_list.remove(item[1])
                    city_name = str(city).replace(" City", "")
                    if occur[item[0]] == occur[item[1]]:
                        noun_list.append(city_name)
                    elif occur[item[0]] > occur[item[1]]:
                        noun_list.append(city_name)
                        noun_list.append(item[0])
                    else:
                        noun_list.append(city_name)
                        noun_list.append(item[1])
        final_combos = list(combinations(noun_list, 2))
        all_results = pd.DataFrame(columns=['search_term', 'interest', 'related'],
                                   index=range(10), dtype=object)
        j = 0
        for item in final_combos:
            query = item[0] + " " + item[1]
            # str.translate needs a table to actually strip punctuation
            query = query.translate(str.maketrans('', '', string.punctuation))
            pytrend.build_payload(kw_list=[query], timeframe=tf)
            interest_over_time_df = pytrend.interest_over_time()
            related_queries_dict = pytrend.related_queries()
            # Use the rising trends to observe the most popular search
            rise_query_df = related_queries_dict[query]['rising']
            all_results.at[j, 'search_term'] = query
            all_results.at[j, 'interest'] = interest_over_time_df
            all_results.at[j, 'related'] = rise_query_df
            if rise_query_df is not None:
                max_line = rise_query_df.loc[rise_query_df['value'].idxmax()]
                max_val = max_line['value']
                max_val_query = max_line['query']
                if max_val > prev_max:
                    prev_max = max_val
                    prev_query = max_val_query
                    best = related_queries_dict
            j = j + 1

        pytrend.build_payload(kw_list=[prev_query], timeframe=tf)
        interest_over_time_df = pytrend.interest_over_time()
        df = interest_over_time_df.drop(labels='isPartial', axis=1)
        py.iplot([{'x': df.index, 'y': df[col], 'name': col} for col in df.columns],
                 filename='simple-line')

        import networkx as nx
        G = nx.Graph()
        for i in range(all_results.shape[0]):
            n1 = all_results.iloc[i]
            G.add_node(n1['search_term'])
            if i != 0:
                edge = (n1['search_term'], all_results.iloc[i - 1]['search_term'])
                G.add_edge(*edge)
            # else:
            #     G.node[0]['pos'] = (10, 5)
            if all_results.iloc[i]['related'] is not None:
                for j in range(n1['related'].shape[0]):
                    n2 = n1['related'].iloc[j]
                    G.add_node(n2['query'])
                    edge = (n1['search_term'], n2['query'])
                    G.add_edge(*edge)
        pos = nx.spring_layout(G, k=2, iterations=20)
        nx.draw(G, node_size=1000, node_color='c', pos=pos, with_labels=True)
        # plt.savefig("simple_path.png")
        # plt.show()

        edge_trace = Scatter(x=[], y=[],
                             line=Line(width=0.5, color='#888'),
                             hoverinfo='none', mode='lines')
        node_trace = Scatter(x=[], y=[], text=list(G.nodes()), mode='markers',
                             hoverinfo='text',
                             marker=Marker(showscale=False, colorscale='YIGnBu',
                                           reversescale=True, color=[], size=20,
                                           colorbar=dict(thickness=15,
                                                         title='Node Connections',
                                                         xanchor='left',
                                                         titleside='right'),
                                           line=dict(width=2)))
        for node in G.nodes():
            node_trace['x'].append(pos[node][0])
            node_trace['y'].append(pos[node][1])
        for edge in G.edges():
            x0, y0 = pos[edge[0]]
            x1, y1 = pos[edge[1]]
            edge_trace['x'] += [x0, x1, None]
            edge_trace['y'] += [y0, y1, None]
        # the node labels are already in node_trace['text']; only colors are added here
        for node in G.nodes():
            node_trace['marker']['color'].append(0)

        fig = Figure(data=Data([edge_trace, node_trace]),
                     layout=Layout(showlegend=False, hovermode='closest',
                                   margin=dict(b=20, l=5, r=5, t=40),
                                   annotations=[dict(showarrow=False,
                                                     xref="paper", yref="paper",
                                                     x=0.005, y=-0.002)],
                                   xaxis=XAxis(showgrid=False, zeroline=False,
                                               showticklabels=False),
                                   yaxis=YAxis(showgrid=False, zeroline=False,
                                               showticklabels=False)))
        py.iplot(fig, filename='networkx')

        # Update plotly plots based on the results from the model
        fig = {
            "data": [{
                "values": [],
                "hoverinfo": "none",
                "marker": {"colors": []},
                "textinfo": "none",
                "hole": 0.6,
                "type": "pie"
            }],
            "layout": {
                "showlegend": False,
                "annotations": [{"text": "", "font": {"size": 10}, "showarrow": False}],
                "title": classification,
                "titlefont": {"family": "Courier New", "size": 180},
                "margin": {"t": 600}
            }
        }
        py.iplot(fig, filename='classification')

        # import plotly.plotly as py
        # from plotly.graph_objs import *
        trace1 = {
            "domain": {"x": [0, 0.31], "y": [0.1, 1]},
            "hole": 0.6,
            "hoverinfo": "none",
            "labels": ["Data", ""],
            "marker": {"colors": ["rgb(53, 196, 170)", "rgb(255, 255, 255)"],
                       "line": {"color": ["rgb(0, 0, 0)"], "width": 2}},
            "name": "CW",
            "textinfo": "none",
            "type": "pie",
            "values": cw_data
        }
        trace2 = {
            "domain": {"x": [0.33, 0.64], "y": [0.1, 1]},
            "hole": 0.6,
            "hoverinfo": "none",
            "labels": ["Data", ""],
            "marker": {"colors": ["rgb(53, 196, 170)", "rgb(255, 255, 255)"],
                       "line": {"color": ["rgb(0, 0, 0)"], "width": 2}},
            "name": "SC",
            "textinfo": "none",
            "type": "pie",
            "values": sc_data
        }
        trace3 = {
            "domain": {"x": [0.66, 1], "y": [0.1, 1]},
            "hole": 0.6,
            "hoverinfo": "none",
            "labels": ["Data", ""],
            "marker": {"colors": ["rgb(53, 196, 170)", "rgb(255, 255, 255)"],
                       "line": {"color": ["rgb(0, 0, 0)"], "width": 2}},
            "name": "AWL",
            "textinfo": "none",
            "type": "pie",
            "values": awl_data
        }
        fig = {
            "data": [trace1, trace2, trace3],
            "layout": {
                "annotations": [
                    {"x": 0.1, "y": 0.12, "font": {"size": 16}, "showarrow": False,
                     "text": "# Cap. Words"},
                    {"x": 0.12, "y": 0.04, "font": {"size": 16}, "showarrow": False,
                     "text": str(np.round(cw_data[0], 0)) + "%"},
                    {"x": 0.47, "y": 0.12, "font": {"size": 16}, "showarrow": False,
                     "text": "# Special Char"},
                    {"x": 0.46, "y": 0.04, "font": {"size": 16}, "showarrow": False,
                     "text": str(np.round(sc_data[0], 0)) + "%"},
                    {"x": 0.9, "y": 0.12, "font": {"size": 16}, "showarrow": False,
                     "text": "Avg. Word Len"},
                    {"x": 0.85, "y": 0.04, "font": {"size": 16}, "showarrow": False,
                     "text": str(np.round(awl_data[0], 0)) + "%"}
                ],
                "showlegend": False,
                "title": "Score Component Breakdown"
            }
        }
        py.iplot(fig, filename='score_bd')
        connection.close()
        return render_template("output.html", article=article)
def test_interest_over_time(self):
    pytrend = TrendReq()
    pytrend.build_payload(kw_list=['pizza', 'bagel'])
    self.assertIsNotNone(pytrend.interest_over_time())
class GoogleTrendsData(object):
    '''Class to get data from Google Trends concurrently.'''

    def __init__(self, kw: list, normalize: bool, category=0, timezone=0,
                 timeframe='today 5-y', geo='US', gprop=''):
        self.kw = kw
        self.normalize = normalize
        self.cat = category
        self.tf = timeframe
        self.geo = geo
        self.gprop = gprop
        self.pytrends = TrendReq(hl='en-US', tz=timezone)

    def __repr__(self):
        return f'Lookup {self.kw}, {"normalized" if self.normalize else "not normalized"}.'

    def norm_str(self, space=False):
        if space:
            return 'Normalized' if self.normalize else 'Not Normalized'
        return 'Normalized' if self.normalize else 'NotNormalized'

    # Multithreading
    def get(self, processes=10):
        """Handles multithreading using ThreadPool; sends items from a list
        to a function and gathers the results as a list."""
        # If we already have the data, read it from the CSV cache without talking to Google
        file_name = 'output/' + ''.join(self.kw) + self.norm_str() + '.csv'
        try:
            data = pd.read_csv(file_name)
            print('Data cached. Reading csv...')
            # Convert the date column from str to datetime
            data['date'] = pd.to_datetime(data['date'])
            data.set_index('date', drop=True, inplace=True)
            return data
        except FileNotFoundError:
            print('Connecting to Google.')

        # If we want to normalize, bypass threading
        if self.normalize:
            result = self.gen_data(self.kw)
            # If we get a list back instead of a DataFrame, we are rate limited
            try:
                result.drop('isPartial', axis=1)
            except AttributeError:
                sys.exit('Rate limited.')
            return result.drop('isPartial', axis=1)

        # Use at most `processes` threads, and no more than one per keyword
        count_threads = min(processes, len(self.kw))
        if count_threads == 0:
            return []
        pool = ThreadPool(count_threads)
        # Tell the user what is happening
        print(f"Getting {len(self.kw)} items in {count_threads} processes.")
        # Call gen_data() once per keyword and collect the resulting DataFrames
        result = list(pool.imap_unordered(self.gen_data, self.kw))
        pool.close()
        pool.join()
        # Result is a list of DataFrames, so we concatenate them together
        try:
            result = pd.concat(result, axis=1, join='inner').drop('isPartial', axis=1)
        except TypeError:
            sys.exit('Rate limited.')
        return result

    def gen_data(self, keywords):
        '''Generate a pandas DataFrame based on the keyword(s) passed'''
        # Handle when we are passed a list of single letters
        if len(keywords[0]) == 1:
            keywords = [''.join(keywords)]
        if self.normalize:
            # Raise the error before we send the request
            if len(keywords) > 5 and isinstance(keywords, list):
                raise ValueError('Too many keywords for normalization.')
            try:
                self.pytrends.build_payload(keywords, cat=self.cat, timeframe=self.tf,
                                            geo=self.geo, gprop=self.gprop)
            except ResponseError:
                return []
            data = self.pytrends.interest_over_time()
            return data
        # Handle when we are not normalizing the data
        else:
            for keyword in keywords:
                print(f'Getting {keyword}')
                # Build the dataset with the first keyword
                if keyword == keywords[0]:
                    try:
                        self.pytrends.build_payload([keyword], cat=self.cat,
                                                    timeframe=self.tf, geo=self.geo,
                                                    gprop=self.gprop)
                    except ResponseError:
                        return []
                    data = self.pytrends.interest_over_time()
                    continue
                # After we have the dataset, append the new data
                self.pytrends.build_payload([keyword], self.cat, self.tf,
                                            self.geo, self.gprop)
                data[keyword] = self.pytrends.interest_over_time()[keyword]
            # Move isPartial to the last column
            cols = list(data.columns.values)
            cols.append(cols.pop(cols.index('isPartial')))
            return data[cols]

    def graph(self, data, filename='o'):
        p = data.plot(x=data.index)
        p.set_title(f'Interest Over Time: {self.norm_str(True)}')
        p.set_ylabel('Interest Level')
        p.set_xlabel('Date')
        p.get_figure().savefig(f'{filename}.png')
        return p

    def save(self, d):
        d.to_csv(f'./output/{"".join(self.kw)}{self.norm_str()}.csv')
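# Hypothetical usage sketch (not part of the original source); assumes an
# `output/` directory exists for the CSV cache and that the keywords are
# illustrative.
lookup = GoogleTrendsData(['bitcoin', 'ethereum'], normalize=True)
df = lookup.get()
lookup.save(df)
print(df.head())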
def get_google_trends(self, kw_list, trdays=250, overlap=100, cat=0, geo='',
                      tz=360, gprop='', hl='en-US', sleeptime=1,
                      isPartial_col=False, from_start=False, scale_cols=True):
    """Retrieve daily Google Trends data for a list of search terms

    Parameters
    ----------
    kw_list : list of search terms (max 5) - see pytrends for more details
    trdays : the number of days to pull data for in a search
        (the max is around 270, though the website seems to indicate 90)
    overlap : the number of overlapped days when stitching two searches together
    cat : category to narrow results - see pytrends for more details
    geo : two-letter country abbreviation (e.g. 'US', 'UK')
        default is '', which returns global results - see pytrends for more details
    tz : timezone offset
        (default is 360, which corresponds to US CST - see pytrends for more details)
    gprop : filter results to a specific Google property
        available options are 'images', 'news', 'youtube' or 'froogle'
        default is '', which refers to web searches - see pytrends for more details
    hl : language (e.g. 'en-US' (default), 'es') - see pytrends for more details
    sleeptime : when stitching multiple searches, this sets the pause between each
    isPartial_col : keep the isPartial column
        (default is False, i.e. the column is removed)
    from_start : when stitching multiple results, this determines whether searches
        are combined going forward or backwards in time
        (default is False, meaning searches are stitched with the most recent first)
    scale_cols : Google Trends searches traditionally return scores between 0 and 100,
        and stitching could produce values greater than 100;
        by setting this to True (default), the values will range between 0 and 100

    Returns
    -------
    pandas DataFrame

    Notes
    -----
    This method is essentially a highly restricted wrapper for the pytrends package.
    Any issues/questions related to its use would probably be more likely resolved
    by consulting the pytrends github page
    https://github.com/GeneralMills/pytrends
    """
    if len(kw_list) > 5 or len(kw_list) == 0:
        raise ValueError("The keyword list must contain between 1 and 5 words")
    if trdays > 270:
        raise ValueError("trdays must not exceed 270")
    if overlap >= trdays:
        raise ValueError("Overlap can't exceed search days")

    stitch_overlap = trdays - overlap
    from_date = datetime.datetime.strptime(self.from_date, '%Y-%m-%d')
    to_date = datetime.datetime.strptime(self.to_date, '%Y-%m-%d')
    n_days = (to_date - from_date).days

    # launch pytrends request
    _pytrends = TrendReq(hl=hl, tz=tz)

    # get the date range for each search
    if n_days <= trdays:
        trend_dates = [' '.join([self.from_date, self.to_date])]
    else:
        trend_dates = ['{} {}'.format(
            (to_date - datetime.timedelta(i + trdays)).strftime("%Y-%m-%d"),
            (to_date - datetime.timedelta(i)).strftime("%Y-%m-%d"))
            for i in range(0, n_days - trdays + stitch_overlap, stitch_overlap)]
    if from_start:
        trend_dates = trend_dates[::-1]

    _pytrends.build_payload(kw_list, cat=cat, timeframe=trend_dates[0],
                            geo=geo, gprop=gprop)
    output = _pytrends.interest_over_time().reset_index()
    if len(output) == 0:
        raise ValueError('search term returned no results (insufficient data)')

    for date in trend_dates[1:]:
        time.sleep(sleeptime)
        _pytrends.build_payload(kw_list, cat=cat, timeframe=date,
                                geo=geo, gprop=gprop)
        temp_trend = _pytrends.interest_over_time().reset_index()
        temp_trend = temp_trend.merge(output, on="date", how="left")

        # it's ugly, but we'll exploit the common column names
        # and then rename the underscore-suffixed column names
        for kw in kw_list:
            norm_factor = np.ma.masked_invalid(
                temp_trend[kw + '_y'] / temp_trend[kw + '_x']).mean()
            temp_trend[kw] = temp_trend[kw + '_x'] * norm_factor
        # keep only the rows that are not already present in `output`
        temp_trend = temp_trend[temp_trend.isnull().any(axis=1)]
        temp_trend['isPartial'] = temp_trend['isPartial_x']
        output = pd.concat([output, temp_trend[['date', 'isPartial'] + kw_list]],
                           axis=0, sort=False)

    # reorder the columns
    output = output[['date', 'isPartial'] + kw_list]
    if not isPartial_col:
        output = output.drop('isPartial', axis=1)
    output = output[output['date'] >= self.from_date]
    if scale_cols:
        # the values in each column are relative to other columns,
        # so we need to get the maximum value across the search columns
        max_val = float(output[kw_list].values.max())
        for col in kw_list:
            output[col] = 100.0 * output[col] / max_val
    output = output.sort_values('date',
                                ascending=self.ascending).reset_index(drop=True)
    return output
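# Hypothetical calling context (an assumption, not from the source): the method
# expects `self.from_date`, `self.to_date` ('YYYY-MM-DD' strings) and
# `self.ascending` on its owning object. A minimal stand-in class:
class TrendsClient:
    def __init__(self, from_date, to_date, ascending=True):
        self.from_date = from_date
        self.to_date = to_date
        self.ascending = ascending

    # reuse the module-level function above as a method
    get_google_trends = get_google_trends

client = TrendsClient('2017-01-01', '2018-01-01')
daily = client.get_google_trends(['bitcoin'], trdays=250, overlap=100)
print(daily.head())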
""" # The keyword list collection can take up to 5 keywords. kw_list = ["covid"] # The Payload - (Requester). # GET DATA FOR INTEREST OVERTIME # Define the parameters for the payload requester for 30-days. pytrends.build_payload(kw_list, cat=0, timeframe='today 1-m', geo='GB', gprop='') # Execute the payload request. pd_iot_thirty_days = pytrends.interest_over_time() # Remove index. pd_iot_thirty_days.reset_index(inplace=True) # Rename API Columns. pd_iot_thirty_days.rename(columns={'date': 'Date', 'covid': 'Value'}, inplace=True) # Add additional columns required for the report. pd_iot_thirty_days["Label"] = pd_iot_thirty_days["Value"] pd_iot_thirty_days["Range"] = "Last-30-Days" # Sort pd_iot_thirty_days By Date. pd_iot_thirty_days.sort_values(by=["Date"], inplace=True, ascending=True) # Print returned output for the pd_iot_thirty_days payload request.
from pytrends.request import TrendReq

pytrends = TrendReq(hl='ja-JP', tz=360)
kw_list = ["Trump"]
pytrends.build_payload(kw_list, cat=0, timeframe='today 1-m', geo='US', gprop='')
result = pytrends.interest_over_time()
# result = pytrends.get_historical_interest(kw_list, year_start=2018, month_start=1,
#     day_start=1, hour_start=0, year_end=2018, month_end=2, day_end=1, hour_end=0,
#     cat=0, geo='', gprop='', sleep=0)
# result = pytrends.interest_by_region(resolution='DMA', inc_low_vol=True, inc_geo_code=False)
# result = pytrends.related_topics()
result.to_csv('data.csv', encoding='utf_8')
class myThread(threading.Thread):
    def __init__(self, keyword, timeframe, count):
        threading.Thread.__init__(self)
        # pytrends Google Trends client
        self.pytrend = TrendReq()
        # search keyword
        self.keyword = keyword
        # timeframe to query
        self.timeframe = timeframe
        # thread count
        self.count = count
        print("Thread " + str(self.count) + " " + self.timeframe + " is starting.")

    def run(self):
        # create the keyword list
        keyword_list = [self.keyword]
        # randomly selected sleep time between requests
        sleep_time = random.randint(5, 10)
        # `sema` is a module-level semaphore bounding concurrent requests
        with sema:
            interest = self.download(keyword_list)
            time.sleep(sleep_time)
        # `result` and `result_lock` are module-level shared state
        global result
        try:
            result_lock.acquire()
            # add the interest DataFrame to the result list
            result.append(interest)
            print("Thread " + str(self.count) + " " + self.timeframe + " is finished.")
        finally:
            result_lock.release()

    def download(self, keyword_list):
        '''
        input: search keyword list (list of strings)
        output: interest DataFrame
        exception: connection abortion (retried after a random backoff)
        '''
        try:
            # call the Google Trends API
            self.pytrend.build_payload(kw_list=keyword_list, cat=0,
                                       timeframe=self.timeframe, geo='', gprop='')
        except (Exception, OSError, requests.ConnectionError) as e:
            print("Thread " + str(self.count) + " " + self.timeframe + " goes wrong.")
            print(e)
            print("Retry downloading Thread " + str(self.count) + " " + self.timeframe)
            # randomly selected sleep time before retrying
            sleep_time = random.randint(10, 30)
            time.sleep(sleep_time)
            interest = self.download(keyword_list)
            return interest
        else:
            # get the DataFrame of interest
            interest = self.pytrend.interest_over_time()
            return interest
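# Hypothetical driver sketch (not from the original): the class assumes these
# module-level objects exist. The limits, keyword and timeframes are
# illustrative assumptions.
import threading, random, time, requests
from pytrends.request import TrendReq

sema = threading.Semaphore(3)   # at most 3 concurrent requests
result_lock = threading.Lock()
result = []

threads = [myThread("bitcoin", tf, i)
           for i, tf in enumerate(["2017-01-01 2017-03-01", "2017-03-01 2017-05-01"])]
for t in threads:
    t.start()
for t in threads:
    t.join()
print(len(result), "DataFrames collected")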
def tweet_text(image_path):
    img = cv2.imread(image_path)
    try:
        string = pytesseract.image_to_string(img)
    except TypeError:
        return 'imgur format'
    stop_words = set(stopwords.words('english'))
    print(string)
    word_tokens = word_tokenize(string)
    filtered_sentence = [w for w in word_tokens if w not in stop_words]
    filtered_sentence = filtered_sentence[3:]
    print(filtered_sentence)
    # Drop the 1000 most common English words
    with open('1-1000.txt', 'r') as fin:
        lines = fin.readlines()
    common = [line.rstrip('\n') for line in lines]
    filtered_sentence = [s for s in filtered_sentence if not s.lower() in common]
    # Drop short tokens and Twitter-interface artifacts left over from OCR
    blacklist = ['Retweets', 'Likes', 'iPhone', 'Twitter', 'Retweeted',
                 'Comments', 'ing']
    bad_chars = ['\n', '.', ',', '/', '|', '©', '>', ':', '-']
    filtered_sentence = [x for x in filtered_sentence
                         if len(x) > 2
                         and not any(c in x for c in bad_chars)
                         and not any(b in x for b in blacklist)]
    # Deduplicate while preserving order
    filtered_sentence = list(dict.fromkeys(filtered_sentence))
    print(filtered_sentence)
    trends_values_ca = []
    trends_values_ny = []
    for items in filtered_sentence:
        temp = [items]
        if len(items) > 0:
            pytrend_ca = TrendReq(hl='en-US', tz=360)
            pytrend_ny = TrendReq(hl='en-US', tz=360)
            pytrend_ca.build_payload(kw_list=list(temp), timeframe='now 1-d', geo='US-CA')
            pytrend_ny.build_payload(kw_list=list(temp), timeframe='now 1-d', geo='US-NY')
            df_ca = pytrend_ca.interest_over_time()
            df_ny = pytrend_ny.interest_over_time()
            try:
                trends_values_ca.append(sum(list(df_ca[items])[-20:]) / (len(df_ca.index) - 20))
            except KeyError:
                trends_values_ca.append(0)
            try:
                # use the NY frame's own length here (the original reused df_ca)
                trends_values_ny.append(sum(list(df_ny[items])[-20:]) / (len(df_ny.index) - 20))
            except KeyError:
                trends_values_ny.append(0)
    # Weight the two regional scores and sort, highest first
    filtered_sentence = sorted(
        list(zip(filtered_sentence,
                 map(lambda x, y: (x * 0.82 + y * 0.18) / 2,
                     trends_values_ca, trends_values_ny))),
        key=itemgetter(1), reverse=True)
    print(filtered_sentence)
    # Build a hashtag string from up to the top four terms
    hashtags = " ".join("#" + term for term, _ in filtered_sentence[:4])
    if hashtags:
        print(hashtags)
    return hashtags
        kw_list.remove('2017')
    else:
        time_start = str(df.post_published[i].date() - timedelta(days=7))
        time_stop = str(df.post_published[i].date() + timedelta(days=7))
        timeframe = time_start + ' ' + time_stop
        pytrends = TrendReq(hl='en-US', tz=360)
        pytrends.build_payload(kw_list, cat=0, timeframe=timeframe, geo='', gprop='')
        trends = pytrends.interest_over_time()
        # If the phrase returns too few points, fall back to its individual words
        if trends.shape[0] < 15:
            kw_list = kw_list[0].split()
            pytrends.build_payload(kw_list, cat=0, timeframe=timeframe, geo='', gprop='')
            trends = pytrends.interest_over_time()
        # NOTE: DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
        df_trends = df_trends.append(
            pd.DataFrame(data=[
                np.append([df.post_published[i], df.tags[i]],
                          trends.sum(axis=1).values)
            ],
class Trender:
    '''
    Keeps track of terms and knows how to plot a term.
    '''

    def __init__(self, terms, daterange, locality):
        self.terms = []
        self.update = True
        self.pytrend = TrendReq()
        # Fall back to defaults before storing the settings
        # (the original assigned self.daterange/self.locality first, so the
        # defaults were never applied; `locality == "DK"` was also a no-op)
        if daterange is None:
            daterange = "today 12-m"
        if locality is None:
            locality = "DK"
        self.daterange = daterange
        self.locality = locality
        for term in terms:
            self.terms.append(
                TermTrend(term,
                          self._getTrend([term], self.daterange, self.locality)))

    def plot(self, plotout):
        df = self.combinedtrend.copy().reset_index()
        x = list(df.columns)[0]
        ys = list(df.columns)[1:]
        plot = interactive_plot(df, x, ys)
        # print("Saving plot...")
        # if plotout is None:
        #     plotout = "plot.png"
        # plot.write_image(plotout)

    def update_trends(self):
        self.update = True
        self.trends()

    def trends(self):
        if self.update:
            # returns trend
            term_faceoff = []
            combinations_list = getCombinations(self.terms)
            tmp_combined_trend = dict()
            for comb in combinations_list:
                combined_trend = self._getTrend([comb[0].term, comb[1].term],
                                                self.daterange, self.locality)
                term = faceoff(comb[0], comb[1], combined_trend)
                tmp_combined_trend[(comb[0], comb[1])] = combined_trend
                term_faceoff.append(term)
            self.update = False
            # Deduplicate while preserving insertion order
            term_faceoff = list(Counter(term_faceoff))
            # create the dataframe
            self.combinedtrend = pd.DataFrame(
                index=term_faceoff[0].trend.index.copy(),
                data={term_faceoff[0].term: term_faceoff[0].trend.iloc[:, 0]})
            scalefactor_list = []
            for i in range(len(term_faceoff) - 1):
                combined_trend = None
                if (term_faceoff[i], term_faceoff[i + 1]) in tmp_combined_trend.keys():
                    combined_trend = tmp_combined_trend[(term_faceoff[i],
                                                         term_faceoff[i + 1])]
                else:
                    combined_trend = tmp_combined_trend[(term_faceoff[i + 1],
                                                         term_faceoff[i])]
                scalefactor = getScaleFactor(combined_trend, term_faceoff[i + 1])
                scalefactor_list.append(scalefactor)
                scaled_trend = term_faceoff[i + 1].trend.iloc[:, 0]
                print(scaled_trend.head())
                for scale in scalefactor_list:
                    scaled_trend = scaled_trend.apply(lambda x: x / 100 * scale)
                print(scaled_trend.head())
                self.combinedtrend[term_faceoff[i + 1].term] = scaled_trend
            print(self.combinedtrend.head())
        else:
            return self.combinedtrend

    def _getTrend(self, payload, daterange, locality):
        payload_built = False
        while not payload_built:
            try:
                self.pytrend.build_payload(kw_list=payload,
                                           timeframe=daterange, geo=locality)
                payload_built = True
            except:
                print("Rebuilding payload...")
                time.sleep(random.randint(1, 4))
        results = None
        tries = 0
        print("Fetching trend:", payload)
        while results is None:
            try:
                results = self.pytrend.interest_over_time()
                results.to_csv("out.csv")
            except:
                print("Retrying...")
                tries += 1
                time.sleep(random.randint(1, 4))
                if tries >= 5:
                    print("Error. Could not connect to server instance.")
        return results
def test_ispartial_dtype_timeframe_all(self):
    pytrend = TrendReq()
    pytrend.build_payload(kw_list=['pizza', 'bagel'], timeframe='all')
    df = pytrend.interest_over_time()
    assert ptypes.is_bool_dtype(df.isPartial)
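# A companion test sketch (an addition, not from the original suite): verifies
# that the interest columns of the same payload come back numeric.
def test_interest_dtype_timeframe_all(self):
    pytrend = TrendReq()
    pytrend.build_payload(kw_list=['pizza', 'bagel'], timeframe='all')
    df = pytrend.interest_over_time()
    assert ptypes.is_numeric_dtype(df['pizza'])
    assert ptypes.is_numeric_dtype(df['bagel'])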
def download_ght_by_states_today(kw_list):
    pytrend = TrendReq()
    states = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "DC", "FL", "GA",
              "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MT", "NE",
              "NV", "NH", "NJ", "NM", "NY", "NC", "ND", "OH", "OK", "OR", "MD",
              "MA", "MI", "MN", "MS", "MO", "PA", "RI", "SC", "SD", "TN", "TX",
              "UT", "VT", "VA", "WA", "WV", "WI", "WY"]

    def state_ind_to_name():
        state_dict = {'US-AL': 'Alabama', 'US-AK': 'Alaska', 'US-AZ': 'Arizona',
                      'US-AR': 'Arkansas', 'US-CA': 'California', 'US-CO': 'Colorado',
                      'US-CT': 'Connecticut', 'US-DE': 'Delaware',
                      'US-DC': 'District of Columbia', 'US-FL': 'Florida',
                      'US-GA': 'Georgia', 'US-HI': 'Hawaii', 'US-ID': 'Idaho',
                      'US-IL': 'Illinois', 'US-IN': 'Indiana', 'US-IA': 'Iowa',
                      'US-KS': 'Kansas', 'US-KY': 'Kentucky', 'US-LA': 'Louisiana',
                      'US-ME': 'Maine', 'US-MD': 'Maryland', 'US-MA': 'Massachusetts',
                      'US-MI': 'Michigan', 'US-MN': 'Minnesota', 'US-MS': 'Mississippi',
                      'US-MO': 'Missouri', 'US-MT': 'Montana', 'US-NE': 'Nebraska',
                      'US-NV': 'Nevada', 'US-NH': 'New Hampshire', 'US-NJ': 'New Jersey',
                      'US-NM': 'New Mexico', 'US-NY': 'New York',
                      'US-NC': 'North Carolina', 'US-ND': 'North Dakota',
                      'US-OH': 'Ohio', 'US-OK': 'Oklahoma', 'US-OR': 'Oregon',
                      'US-PA': 'Pennsylvania', 'US-RI': 'Rhode Island',
                      'US-SC': 'South Carolina', 'US-SD': 'South Dakota',
                      'US-TN': 'Tennessee', 'US-TX': 'Texas', 'US-UT': 'Utah',
                      'US-VT': 'Vermont', 'US-VA': 'Virginia', 'US-WA': 'Washington',
                      'US-WV': 'West Virginia', 'US-WI': 'Wisconsin',
                      'US-WY': 'Wyoming'}
        return state_dict

    st_name = state_ind_to_name()
    frames = []
    for st_ind in states:
        pytrend.build_payload(kw_list, geo='US-' + st_ind, timeframe='today 5-y')
        print('region processed US-' + st_ind)
        df1 = pytrend.interest_over_time()
        df1 = df1.assign(state=st_name['US-' + st_ind])
        frames.append(df1)
    # DataFrame.append was removed in pandas 2.0, so collect and concat instead
    df = pd.concat(frames)
    # `filepath` is assumed to be defined at module level
    filename = (filepath + 'ght_state-' + str(epi.Week.thisweek().year)
                + "{:02d}".format(epi.Week.thisweek().week) + '.csv')
    df.to_csv(filename)
    return df
# must have pytrends installed
from pytrends.request import TrendReq

pytrends = TrendReq(hl='en-US', tz=360)
print("debug 1")
pytrends.build_payload(kw_list=['coinbase'], timeframe='now 1-H')
print("debug 2")
testdata = pytrends.interest_over_time()
print("debug 3")
print(testdata)
        df = pickle.load(f)
        print('Loaded {} from cache'.format(quandl_id))
    except (OSError, IOError) as e:
        print('Downloading {} from Quandl'.format(quandl_id))
        df = quandl.get(quandl_id, returns="pandas")
        df.to_pickle(cache_path)
        print('Cached {} at {}'.format(quandl_id, cache_path))
    return df

google_trends = TrendReq(hl='en-US', tz=360)
google_trends.build_payload(
    kw_list=[search_term],
    timeframe='today 3-m',
)
df_interest = google_trends.interest_over_time()
df_price = cached_fetch_quantl(quandl_data_id)

'''Get start date and drop previous data'''
start = pd.to_datetime('today') - relativedelta(months=3)
start = start.date()
df_price = df_price.loc[start:]
df_interest = df_interest.loc[start:]
df = pd.concat([
    df_interest[search_term],
    df_price['Weighted Price'],
    df_price['Volume (BTC)']
], axis=1)

'''Plot data'''
fig, ax1 = plt.subplots()
    if os.path.exists(out_file):
        print(coin + " has been downloaded.")
        continue

    # Run the first time (if we want to start from today; otherwise we need to
    # ask for an end_date as well)
    # today = datetime.today().date()
    today = end_date
    old_date = today
    # Go back in time
    new_date = today - timedelta(days=step)
    # Create the new timeframe for which we download data
    timeframe = new_date.strftime('%Y-%m-%d') + ' ' + old_date.strftime('%Y-%m-%d')
    pytrend.build_payload(kw_list=kw_list, timeframe=timeframe)
    interest_over_time_df = pytrend.interest_over_time()
    data_flag = 1

    ## RUN ITERATIONS
    while new_date > start_date:
        ### Save the new date from the previous iteration.
        # Overlap == 1 would mean that we start where we
        # stopped on the iteration before, which gives us
        # indeed overlap == 1.
        old_date = new_date + timedelta(days=overlap - 1)
        ### Update the new date to take a step into the past.
        # Since the timeframe that we can apply for daily data
        # is limited, we use step = maxstep - overlap instead of
        # maxstep.
def pullTrends(kw_list, start_date, end_date):
    from pytrends.request import TrendReq
    pytrends = TrendReq(hl='en-US', tz=360)
    term = kw_list[0]

    def toTimeframe(ts1, ts2):
        s1 = f"{ts1:%Y-%m-%d}"
        s2 = f"{ts2:%Y-%m-%d}"
        return "{0} {1}".format(s1, s2)

    def diff_month(d1, d2):
        return (d1.year - d2.year) * 12 + d1.month - d2.month

    def next_month(d):
        if d.month == 12:
            d = d.replace(year=d.year + 1, month=1)
        else:
            d = d.replace(month=d.month + 1)
        return d

    def last_day_of_month(d):
        d = d.replace(day=1)
        d = next_month(d)
        d += timedelta(days=-1)
        return d

    def first_day_of_month(d):
        return d.replace(day=1)

    def renormalize(df):
        # Scale each month of daily values by that month's monthly score
        months, daylist = df
        for i in range(months.shape[0]):
            daylist[i][term] = daylist[i][term].apply(
                lambda x: x * months.iloc[i][term])
        return daylist

    def flatten(daylist):
        # DataFrame.append was removed in pandas 2.0; concat the month frames instead
        return pd.concat(daylist)

    start_date = pd.to_datetime(start_date)
    st_date = start_date
    end_date = pd.to_datetime(end_date)
    n_months = diff_month(end_date, start_date)

    # pull months
    # hacky fix, don't change: pad short ranges so Google returns the long
    # series at monthly resolution
    if n_months < 63:
        start_date_tmp = start_date.replace(
            year=start_date.year - int((64 - n_months + 11) / 12))
    else:
        start_date_tmp = start_date
    # to string
    start_date_str = f"{start_date_tmp:%Y-%m-%d}"
    end_date_str = f"{end_date:%Y-%m-%d}"

    # get monthly
    pytrends.build_payload(kw_list, cat=0,
                           timeframe="{0} {1}".format(start_date_str, end_date_str),
                           geo='US', gprop='')
    monthly = pytrends.interest_over_time()
    monthly = monthly[monthly.index > st_date]

    # get daily, one month at a time
    start_date = first_day_of_month(st_date)
    tmp_end_date = last_day_of_month(start_date)
    daylist = []
    for i in range(n_months):
        pytrends.build_payload(kw_list, cat=0,
                               timeframe=toTimeframe(start_date, tmp_end_date),
                               geo='US', gprop='')
        daily = pytrends.interest_over_time()
        daylist.append(daily)
        start_date = next_month(start_date)
        tmp_end_date = last_day_of_month(start_date)

    return flatten(renormalize((monthly, daylist))).drop(['isPartial'], axis=1)
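# Hypothetical usage sketch (keyword and dates are illustrative assumptions).
# Each month of daily scores (0-100 within that month) is multiplied by that
# month's score on the long monthly series, making days comparable across months.
daily = pullTrends(['bitcoin'], '2017-01-01', '2017-12-31')
print(daily.head())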
_rolling_dates = [
    ' '.join(map(lambda x: x.strftime(_date_fmt),
                 [_tmp_range[i], _tmp_range[i + 1]]))
    for i in range(len(_tmp_range) - 1)
]

# initialization of the major data frame _df_trends;
# _dates will contain our last payload argument
_dates = _rolling_dates[0]
_pytrends.build_payload(_kw_list, cat=_cat, timeframe=_dates, geo=_geo, gprop=_gprop)
_df_trends = _pytrends.interest_over_time()

for _dates in _rolling_dates[1:]:
    # we need to normalize the data before concatenation
    _common_date = _dates.split(' ')[0]
    _pytrends.build_payload(_kw_list, cat=_cat, timeframe=_dates, geo=_geo, gprop=_gprop)
    _tmp_df = _pytrends.interest_over_time()
    # rescale the new window so it matches the existing series on the shared date
    _multiplication_factor = _df_trends.loc[_common_date] / \
        _tmp_df.loc[_common_date]
    _df_trends = pd.concat([_df_trends, _tmp_df[1:] * _multiplication_factor])
def pull_keywords_trend(keywords_list, keyword_short_name, time_frame, geo='US',
                        save_folder=None, relative_to_each_other=True):
    """
    :param keywords_list: up to 5 keywords
    :param time_frame: specific dates, 'YYYY-MM-DD YYYY-MM-DD',
        e.g. '2016-12-14 2017-01-25'
    :param geo:
    :param save_folder: the path to save to, optional
    :param relative_to_each_other:

    # notes on the keywords: last used: ['AMGN', 'CELG', 'BIIB', 'GILD', 'REGN']

    # notes on the timeframe
        - Date to start from - defaults to the last 5 years, 'today 5-y'
        - Everything: 'all'
        - Specific dates: 'YYYY-MM-DD YYYY-MM-DD', e.g. '2016-12-14 2017-01-25'
        - Specific datetimes: 'YYYY-MM-DDTHH YYYY-MM-DDTHH',
          e.g. '2017-02-06T10 2017-02-12T07'
          Note: the time component is based on UTC

    Current time minus a time pattern:
        By month: 'today #-m' where # is the number of months back to pull data for.
            For example, 'today 3-m' gets data from today back to 3 months ago.
            NOTE: Google uses the UTC date as 'today'.
            Seems to work for 1, 2 and 3 months only.
        Daily: 'now #-d' where # is the number of days back to pull data for.
            For example, 'now 7-d' gets data from the last week.
            Seems to work for 1 and 7 days only.
        Hourly: 'now #-H' where # is the number of hours back to pull data for.
            For example, 'now 1-H' gets data from the last hour.
            Seems to work for 1 and 4 hours only.

    35 weeks back is returned by day; 50 weeks back is returned by week.
    :return: DataFrame
    """
    # proxies example: ['https://35.201.123.31:880']
    pytrends = TrendReq(hl='en-US', tz=360)  # tz is the timezone offset in minutes
    if relative_to_each_other:
        pytrends.build_payload(keywords_list, cat=0, timeframe=time_frame,
                               geo=geo, gprop='')
        interest_over_time_df = pytrends.interest_over_time()
    else:
        print('Keywords will be pulled one by one, so each one peaks at 100 '
              'within the period.')
        interest_over_time_df = pd.DataFrame()
        for keyword in keywords_list:
            pytrends.build_payload([keyword], cat=0, timeframe=time_frame,
                                   geo=geo, gprop='')
            if len(interest_over_time_df) == 0:
                interest_over_time_df = pytrends.interest_over_time()
            else:
                interest_over_time_df[keyword] = pytrends.interest_over_time()[keyword]

    # Check whether the returned series is by week or by day
    a_delta = interest_over_time_df.index[1] - interest_over_time_df.index[0]
    if a_delta.days > 1:
        by_day = False
        print('This trend is by week')
    else:
        by_day = True
        print('This trend is by day')

    # Build the file name for the pulled trend
    file_name = f'{keyword_short_name}_{relative_to_each_other}_{"by_day" if by_day else "by_week"}.csv'
    print(f'trend file name is {file_name}')
    if save_folder:
        interest_over_time_df.to_csv(os.path.join(save_folder, file_name))
    print(f'Total lines: {len(interest_over_time_df)}')
    print(interest_over_time_df)
    return interest_over_time_df
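# Usage sketch built from the docstring's own example keywords and dates
# (the short name and save folder are illustrative assumptions): pulling five
# biotech tickers independently so each series peaks at 100.
df = pull_keywords_trend(['AMGN', 'CELG', 'BIIB', 'GILD', 'REGN'],
                         keyword_short_name='biotech',
                         time_frame='2016-12-14 2017-01-25',
                         relative_to_each_other=False,
                         save_folder='.')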
def graph(request):
    words = request.GET.get('q')
    if not words:
        return redirect('tot:index')
    pytrends = TrendReq()
    # Map the y parameter to a pytrends timeframe
    if request.GET.get('y') == 'year':
        date = 'today 12-m'
    elif request.GET.get('y') == 'month':
        date = 'today 1-m'
    else:
        date = 'now 7-d'

    if ',' not in words:
        word1 = words
        # build_payload expects a list of keywords, not a bare string
        kw_list = [word1]
        print(f'{date}')
        pytrends.build_payload(kw_list, cat=0, timeframe=f'{date}', geo='', gprop='')
        value = pytrends.interest_over_time()
        del value['isPartial']
        value = value.reset_index()
        value2 = value.to_json(force_ascii=False, orient='split',
                               date_format='iso', date_unit='s')
        abc = json.loads(value2)
        ab = []
        for a in abc['data']:
            k = {}
            h = datetime.strptime(a[0], '%Y-%m-%dT%H:%M:%SZ')
            h2 = h.strftime('%Y-%m-%d %H:%M:%S')
            k['label'] = h2
            k['y'] = a[1]
            k['link'] = '/anal'
            ab.append(k)
        context = {'ab': ab, 'word1': word1}
        return render(request, 'tot/graph.html', context)
    else:
        words = words.split(',')
        word1 = words[0]
        word2 = words[1]
        kw_list = [word1, word2]
        pytrends.build_payload(kw_list, cat=0, timeframe=f'{date}', geo='', gprop='')
        value = pytrends.interest_over_time()
        del value['isPartial']
        value = value.reset_index()
        value2 = value.to_json(force_ascii=False, orient='split',
                               date_format='iso', date_unit='s')
        abc = json.loads(value2)
        ab = []
        cd = []
        for a in abc['data']:
            k = {}
            z = {}
            h = datetime.strptime(a[0], '%Y-%m-%dT%H:%M:%SZ')
            h2 = h.strftime('%Y-%m-%d %H:%M:%S')
            k['label'] = h2
            k['y'] = a[1]
            k['link'] = '/anal'
            ab.append(k)
            z['label'] = h2
            z['y'] = a[2]
            z['link'] = '/anal'
            cd.append(z)
        context = {'ab': ab, 'cd': cd, 'word1': word1, 'word2': word2}
        return render(request, 'tot/graph.html', context)
databegin = list(map(formatter, range(0, 19, 3)))
dataend = list(map(formatter, range(4, 25, 3)))
for w in range(306, 373):
    daysprior = w  # relative to 24/06/2018, so the first date is 21/06/2018
    daysbefore = datetime.date.today() - datetime.timedelta(days=daysprior)
    keywords = ["blockchain"]
    # Pull overlapping ~4-hour windows across the day
    for i in range(0, len(databegin)):
        begin = daysbefore.strftime("%Y-%m-%d") + "T" + databegin[i]
        end = daysbefore.strftime("%Y-%m-%d") + "T" + dataend[i]
        timeframestring = begin + " " + end
        for j in range(0, len(keywords)):
            pytrend.build_payload(kw_list=[keywords[j]], timeframe=timeframestring)
            df = pytrend.interest_over_time()
            df.to_csv("../data/" + keywords[j] + "/" + timeframestring + ".csv")
    # The final window wraps past midnight into the next day
    begin = daysbefore.strftime("%Y-%m-%d") + "T21"
    end = (datetime.date.today()
           - datetime.timedelta(days=daysprior - 1)).strftime("%Y-%m-%d") + "T01"
    timeframestring = begin + " " + end
    for j in range(0, len(keywords)):
        pytrend.build_payload(kw_list=[keywords[j]], timeframe=timeframestring)
        df = pytrend.interest_over_time()
        df.to_csv("../data/" + keywords[j] + "/" + timeframestring + ".csv")
from pytrends.request import TrendReq

pytrend = TrendReq(hl='de', tz=390, retries=10, backoff_factor=0.5)
keywords = ['foo', 'bar', 'dummy']
gbl = globals()
for i, keyword in enumerate(keywords):
    try:
        pytrend.build_payload(kw_list=[keyword], geo='DE', timeframe='now 1-d')
        # store each keyword's frame only once, under its own name
        # (the original inner loop overwrote every df_* with the current payload)
        gbl['df_' + str(i)] = pytrend.interest_over_time().drop(
            labels=['isPartial'], axis='columns')
        print(keyword + ' was successfully pulled from Google Trends')
    except Exception as e:
        print(keyword + ' was not successfully pulled because of the following error: '
              + str(e))
        continue
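# A dict keyed by keyword avoids writing into globals() and scales to any
# keyword list; a minimal alternative sketch of the same loop.
frames = {}
for keyword in keywords:
    try:
        pytrend.build_payload(kw_list=[keyword], geo='DE', timeframe='now 1-d')
        frames[keyword] = pytrend.interest_over_time().drop(
            labels=['isPartial'], axis='columns')
    except Exception as e:
        print(keyword + ' failed: ' + str(e))
print(list(frames.keys()))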
class GTrends:
    def __init__(self, encoding, tz, timeout_connect, timeout_read, retries,
                 backoff_factor, geo, dbx):
        # initialize the Google Trends connector
        self.pytrends = TrendReq(hl=encoding, tz=tz,
                                 timeout=(timeout_connect, timeout_read),
                                 retries=retries, backoff_factor=backoff_factor)
        self.geo = geo
        self.dbx = dbx

    def set_log(self, log):
        self.log = log

    # Download the file for a ticker, category_name and timeframe.
    # If the file already exists, do not download it again.
    def download_file(self, ticker, category_name, frame, local_path, monthly=False):
        try:
            self.pytrends.build_payload([ticker], cat=category_name,
                                        timeframe=frame, geo=self.geo, gprop='')
            self.log.info('Downloading data for ticker %s, category %s, frame %s'
                          % (ticker, category_name, frame))
            data = self.pytrends.interest_over_time()
            df = pd.DataFrame(data)
            if df.shape[0] == 0:
                self.log.info('Empty file for ticker %s, category %s, frame %s'
                              % (ticker, category_name, frame))
                # save an empty file so we do not repeat the request next time
                df.to_csv(local_path, index=False)
                return True
            if 'isPartial' in df.columns:
                df.drop('isPartial', axis=1, inplace=True)
            df.reset_index(level=0, inplace=True)
            df['ticker'] = ticker
            if monthly:
                df['date'] = pd.to_datetime(df['date'])
                df['date'] = df['date'].apply(lambda x: x.strftime('%Y-%m'))
            df.to_csv(local_path, index=False)
            return True
        except Exception as ex:
            self.log.error('There has been an error downloading ticker %s, '
                           'category %s, frame %s ex:%s\n%s',
                           ticker, category_name, frame, type(ex), ex)
            return False

    # For each ticker, download monthly and daily data for each category.
    def import_data(self, tickers_path, year_from, year_until, categories,
                    data_folder_monthly, data_folder_daily,
                    data_folder_monthly_dropbox, data_folder_daily_dropbox):
        lines = open(tickers_path).readlines()[1:]
        year_range = range(int(year_from), int(year_until))
        download_all = True
        for ticker in lines:
            ticker = ticker.rstrip('\n')
            self.log.info('Process ticker %s' % (ticker))
            for category in categories:
                category_type = category.split(':')
                category_name = category_type[0]
                category_type = category_type[1]
                if category_type == 'monthly':
                    dropbox_path = data_folder_monthly_dropbox + category_name
                    if not self.dbx.folder_exists(dropbox_path):
                        self.log.info('Create folder ' + dropbox_path)
                        self.dbx.create_folder(dropbox_path)
                    # download monthly data for the whole year range
                    file_name = ticker + '_' + category_name + '_monthly.csv'
                    if self.dbx.file_exists(dropbox_path, file_name):
                        continue
                    frame = year_from + '-01-01 ' + year_until + '-12-31'
                    download_all = self.download_file(
                        ticker, category_name, frame,
                        data_folder_monthly + file_name, True)
                    if download_all:
                        files_manager.upload_file(data_folder_monthly + file_name,
                                                  dropbox_path + '/' + file_name,
                                                  self.dbx)
                    else:
                        break
                else:
                    dropbox_path = data_folder_daily_dropbox + category_name
                    if not self.dbx.folder_exists(dropbox_path):
                        self.dbx.create_folder(dropbox_path)
                    for year in year_range:
                        # download the first daily file for the year
                        file_name = ticker + '_' + category_name + '_1_daily.csv'
                        if self.dbx.file_exists(dropbox_path, file_name):
                            continue
                        frame = str(year) + '-01-01 ' + str(year) + '-06-30'
                        download_all = self.download_file(
                            ticker, category_name, frame,
                            data_folder_daily + file_name)
                        if download_all:
                            files_manager.upload_file(data_folder_daily + file_name,
                                                      dropbox_path + '/' + file_name,
                                                      self.dbx)
                        else:
                            break
                        # download the second daily file for the year
                        file_name = ticker + '_' + category_name + '_2_daily.csv'
                        if self.dbx.file_exists(dropbox_path, file_name):
                            continue
                        frame = str(year) + '-07-01 ' + str(year) + '-12-31'
                        download_all = self.download_file(
                            ticker, category_name, frame,
                            data_folder_daily + file_name)
                        if download_all:
                            files_manager.upload_file(data_folder_daily + file_name,
                                                      dropbox_path + '/' + file_name,
                                                      self.dbx)
                        else:
                            break
                    if not download_all:
                        break
        return download_all
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from pytrends.request import TrendReq

sns.set_style("whitegrid")
pytrend = TrendReq()

# Electronic games ('Jogos Eletrônicos'), search keyword
pytrend.build_payload(kw_list=['Jogos Eletrônicos'], geo='BR', timeframe='all')
jogosEle = pytrend.interest_over_time()
jogosEle.head()
jogosEle.tail()
jogosEle.shape
jogosEle['2020-01-01':'2020-10-01']

plt.figure(figsize=(20, 8))
plt.plot(jogosEle['Jogos Eletrônicos'])
plt.grid(True)
plt.title("Number of Searches per Year")
plt.ylabel("Number of Searches")
plt.xlabel("Date")
dates_list = [
    '2014-01-01 2014-09-01',
    '2014-09-01 2015-05-01',
    '2015-05-01 2016-01-01',
    '2016-01-01 2016-09-01',
    '2016-09-01 2017-05-01',
    '2017-05-01 2018-01-01',
    '2018-01-01 2018-04-08',
]
kw_list = ["Blockchain", "Bitcoin", "Etherium", "crypto", "cryptocurrency"]

frames = []
for d in dates_list:
    pytrends = TrendReq(hl='en-US', tz=360)
    pytrends.build_payload(kw_list, cat=0, timeframe=d, geo='', gprop='')
    frames.append(pytrends.interest_over_time())
    time.sleep(2)

# DataFrame.append was removed in pandas 2.0; concatenate the chunks instead
trends_df = pd.concat(frames)
trends_df = trends_df.loc[~trends_df.index.duplicated()]
trends_df.to_csv("google_trends_stats.csv")
trends_df['2017-06':'2018-03'].Bitcoin.plot(kind='line', figsize=(12, 6),
                                            title='Google Trends', style='b-',
                                            use_index=True)
from pytrends.request import TrendReq
import matplotlib.pyplot as plt
import pandas as pd
import os

# Search keywords, region and period
keyword1 = "신라면"
keyword2 = "진라면"
local_area = "KR"
period = "today 5-y"

# Connect to Google Trends and load the data
trend_obj = TrendReq()
trend_obj.build_payload(kw_list=[keyword1, keyword2], timeframe=period,
                        geo=local_area)
trend_df = trend_obj.interest_over_time()

# Work around the matplotlib Korean font problem
from matplotlib import font_manager, rc
cwd = os.getcwd()
font_path = os.path.join(cwd, "data", "malgun.ttf")  # location of the font file
font_name = font_manager.FontProperties(fname=font_path).get_name()
rc('font', family=font_name)

# Draw the chart
plt.style.use("ggplot")
plt.figure(figsize=(14, 5))
trend_df[keyword1].plot()
trend_df[keyword2].plot()
plt.title("Google Trends: %s vs. %s" % (keyword1, keyword2), size=15)
plt.legend(loc="best")
def getSearchesOverTime(searchQuery, startAndEndDates):
    # NOTE: username/password login was removed from recent pytrends releases;
    # this signature only works with old versions of the library.
    pytrend = TrendReq(GOOGLE_USERNAME, GOOGLE_PASSWORD,
                       custom_useragent="get google trends data script")
    pytrend.build_payload(kw_list=[searchQuery], timeframe=startAndEndDates)
    return pytrend.interest_over_time()
def download_ght_by_country_today(kw_list):
    pytrend = TrendReq()
    pytrend.build_payload(kw_list, geo='US', timeframe='today 5-y')
    df = pytrend.interest_over_time()
    return df
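# Hypothetical usage sketch (the keyword is an illustrative assumption); pairs
# the national series with the per-state download defined earlier.
national = download_ght_by_country_today(['flu'])
by_state = download_ght_by_states_today(['flu'])
print(national.tail())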
class KeywordOverTimeTrends:
    '''KeywordOverTimeTrends

    This class dissects the time information for a given keyword. By extracting
    time-series data from Google Trends for a particular keyword, we wish to
    learn about the time characteristics of this keyword and to see whether
    there is anything seasonal or trending about it.
    '''

    def __init__(self):
        plt.switch_backend('Agg')
        self.pytrends = TrendReq()
        self.keyword = None
        self.df = None
        self.smodel = None
        self.sresult = None
        self.amodel = None
        self.aresult = None

    def start_search(self, keyword):
        self.keyword = keyword
        log_msg = 'starting keyword search for:' + self.keyword
        logger.info(log_msg)
        self.get_pytrend_data()
        logger.info('cleaning dataframe...')
        self.clean_df()
        logger.info('building SARIMA...')
        self.build_SARIMA()
        # logger.info('building ARIMA...')
        # self.build_ARIMA()
        logger.info('finished kot models!')

    def get_pytrend_data(self):
        self.pytrends.build_payload(kw_list=[self.keyword])
        self.df = self.pytrends.interest_over_time()

    def get_test_data(self, n=5):
        return self.df.sample(n)

    def clean_df(self):
        self.df.drop(['isPartial'], axis=1, inplace=True)
        self.df.index.freq = 'W'
        # Use the line above instead of resampling, which takes time:
        # self.df = self.df.resample('W').mean()

    def show_time_series_plot(self):
        self.df.plot(figsize=GRAPH_PARAMS_FIGSIZE,
                     linewidth=GRAPH_PARAMS_LINEWIDTH,
                     fontsize=GRAPH_PARAMS_FONTSIZE)
        plt.xlabel('Year', fontsize=GRAPH_PARAMS_FONTSIZE)
        return plt

    def show_time_series_plot_in_html(self):
        return plt_to_html(self.show_time_series_plot())

    # we will just use decomposition plots
    def show_rolling_average_plot(self):
        self.df.rolling(ROLLING_AVG).mean().plot(figsize=GRAPH_PARAMS_FIGSIZE,
                                                 linewidth=GRAPH_PARAMS_LINEWIDTH,
                                                 fontsize=GRAPH_PARAMS_FONTSIZE)
        plt.xlabel('Year', fontsize=GRAPH_PARAMS_FONTSIZE)
        plt.legend(['Rolling Avg 52wks'])
        return plt

    def show_rolling_average_plot_in_html(self):
        return plt_to_html(self.show_rolling_average_plot())

    # we will just use decomposition plots
    def show_first_order_diff_plot(self):
        self.df.diff().plot(figsize=GRAPH_PARAMS_FIGSIZE,
                            linewidth=GRAPH_PARAMS_LINEWIDTH,
                            fontsize=GRAPH_PARAMS_FONTSIZE)
        plt.xlabel('Year', fontsize=GRAPH_PARAMS_FONTSIZE)
        return plt

    def show_first_order_diff_plot_in_html(self):
        return plt_to_html(self.show_first_order_diff_plot())

    # we will just use decomposition plots
    def show_autocorrelation_plot(self):
        return pd.plotting.autocorrelation_plot(self.df)

    def show_autocorrelation_plot_in_html(self):
        return plt_to_html(self.show_autocorrelation_plot())

    def show_decomposition_plot(self):
        rcParams['figure.figsize'] = 12, 10
        decomposition = sm.tsa.seasonal_decompose(self.df, model='additive')
        decomposition.plot()
        return plt

    def show_decomposition_plot_in_html(self):
        return plt_to_html(self.show_decomposition_plot())

    def build_SARIMA(self):
        self.smodel = sm.tsa.statespace.SARIMAX(self.df, order=SARIMA_ORDER,
                                                seasonal_order=SARIMA_SEASONAL_ORDER,
                                                enforce_stationarity=False,
                                                enforce_invertibility=False)
        self.sresult = self.smodel.fit()
        # print(self.sresult.summary().tables[1])

    def show_SARIMA_diagnostics_plot(self):
        self.sresult.plot_diagnostics(figsize=GRAPH_PARAMS_FIGSIZE)
        return plt

    def show_SARIMA_diagnostics_plot_in_html(self):
        return plt_to_html(self.show_SARIMA_diagnostics_plot())

    def show_SARIMA_prediction_plot(self):
        pred_uc = self.sresult.get_forecast(steps=SARIMA_PREDICTION_STEPS)
        pred_ci = pred_uc.conf_int()
        ax = self.df.plot(label='Observed', figsize=GRAPH_PARAMS_FIGSIZE)
        pred_uc.predicted_mean.plot(ax=ax, label='Forecast')
        ax.fill_between(pred_ci.index, pred_ci.iloc[:, 0], pred_ci.iloc[:, 1],
                        color='k', alpha=.25)
        ax.set_xlabel('Year')
        ax.set_ylabel(self.keyword)
        plt.legend()
        # plt.show()
        return plt

    def show_SARIMA_prediction_plot_in_html(self):
        return plt_to_html(self.show_SARIMA_prediction_plot())

    def show_all_plots_in_html(self):
        a = self.show_time_series_plot_in_html()
        b = self.show_rolling_average_plot_in_html()
        c = self.show_decomposition_plot_in_html()
        d = self.show_SARIMA_diagnostics_plot_in_html()
        e = self.show_SARIMA_prediction_plot_in_html()
        return {"tseries": a, "rolling": b, "decompose": c,
                "s-diag": d, "s-pred": e}

    def clear_all_plots(self):
        logger.info('all plots cleared!')
        plt.close('all')
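# Hypothetical usage sketch: assumes the module-level names the class references
# (GRAPH_PARAMS_*, SARIMA_ORDER, SARIMA_SEASONAL_ORDER, SARIMA_PREDICTION_STEPS,
# ROLLING_AVG, logger, plt_to_html) are defined; the keyword is illustrative.
kot = KeywordOverTimeTrends()
kot.start_search('bitcoin')
plots = kot.show_all_plots_in_html()
kot.clear_all_plots()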
"""
Uses Google Trends to pull the search volume of given keywords within a given
timeframe.
"""
import pandas as pd
from pytrends.request import TrendReq
import matplotlib
import datetime as dt

# list of keywords; if the list is too long it may return error 400
# (5 keywords or fewer seems to work)
keywords = [
    'cbdoil', 'cannabis', 'patchouli', 'ylang ylang oil', 'lavender oil'
]

# request the keyword search volume from the last three months
pytrend = TrendReq(hl='en-US', tz=360)
pytrend.build_payload(kw_list=keywords, cat=0, timeframe='today 3-m', gprop='')
data = pytrend.interest_over_time()
data = data.drop(labels=['isPartial'], axis='columns')

# creates a visual plot of traffic over time; remove the triple quotes to
# create the image file
"""
image = data.plot(title='Traffic from the Last 3 Months')
fig = image.get_figure()
fig.savefig('traffic.png')
"""

# output the search results to a csv file
data.to_csv('test.csv', sep=';', encoding='utf_8_sig', header=True)
    print("Targets: \nFirst Date: " + target_date_1 + "\nSecond Date: " +
          target_date_2 + "\nCountry: " + target_country + "\nState: " +
          target_state + "\nCounty Code: " + target_county + "\n")
    print("Loaded keywords: ")
    print(keywords_read())

    # data requests, five keywords per payload (the pytrends maximum);
    # assumes the keyword file holds a multiple of five entries
    while collect_cycle != len(keywords_read()):
        pytrend.build_payload(
            [keywords_read()[collect_cycle],
             keywords_read()[collect_cycle + 1],
             keywords_read()[collect_cycle + 2],
             keywords_read()[collect_cycle + 3],
             keywords_read()[collect_cycle + 4]],
            timeframe=target_date_1 + " " + target_date_2,
            geo=target_country + "-" + target_state + "-" + target_county)
        collection_list.append(pytrend.interest_over_time())
        collect_cycle += 5

    # merging dataframes
    export_dataframe = pandas.concat(collection_list, axis=1)
    # remove isPartial columns
    del export_dataframe["isPartial"]

    # exports
    if arguments.CSV_out == 1:
        print("Exporting to CSV...")
        export_dataframe.to_csv(str(randint(1, 9999999)) + ".csv")
def get_google_trends(self, kw_list, trdays=250, overlap=100, cat=0, geo='',
                      tz=360, gprop='', hl='en-US', sleeptime=1,
                      isPartial_col=False, from_start=False, scale_cols=True):
    """Retrieve daily Google Trends data for a list of search terms.

    This is a variant of the method documented above: the parameters are
    identical, but instead of raising on a failed or empty request it returns
    a one-row DataFrame with an 'error' column.
    """
    if len(kw_list) > 5 or len(kw_list) == 0:
        raise ValueError("The keyword list must contain between 1 and 5 words")
    if trdays > 270:
        raise ValueError("trdays must not exceed 270")
    if overlap >= trdays:
        raise ValueError("Overlap can't exceed search days")

    stitch_overlap = trdays - overlap
    from_date = datetime.datetime.strptime(self.from_date, '%Y-%m-%d')
    to_date = datetime.datetime.strptime(self.to_date, '%Y-%m-%d')
    n_days = (to_date - from_date).days

    # launch pytrends request
    _pytrends = TrendReq(hl=hl, tz=tz)

    # get the date range for each search
    if n_days <= trdays:
        trend_dates = [' '.join([self.from_date, self.to_date])]
    else:
        trend_dates = ['{} {}'.format(
            (to_date - datetime.timedelta(i + trdays)).strftime("%Y-%m-%d"),
            (to_date - datetime.timedelta(i)).strftime("%Y-%m-%d"))
            for i in range(0, n_days - trdays + stitch_overlap, stitch_overlap)]
    if from_start:
        trend_dates = trend_dates[::-1]

    try:
        _pytrends.build_payload(kw_list, cat=cat, timeframe=trend_dates[0],
                                geo=geo, gprop=gprop)
    except Exception as e:
        return pd.DataFrame({"error": e}, index=[0])
    output = _pytrends.interest_over_time().reset_index()
    if len(output) == 0:
        return pd.DataFrame(
            {"error": 'search term returned no results (insufficient data)'},
            index=[0])

    for date in trend_dates[1:]:
        time.sleep(sleeptime)
        try:
            _pytrends.build_payload(kw_list, cat=cat, timeframe=date,
                                    geo=geo, gprop=gprop)
        except Exception as e:
            return pd.DataFrame({"error": e}, index=[0])
        temp_trend = _pytrends.interest_over_time().reset_index()
        temp_trend = temp_trend.merge(output, on="date", how="left")

        # it's ugly, but we'll exploit the common column names
        # and then rename the underscore-suffixed column names
        for kw in kw_list:
            norm_factor = np.ma.masked_invalid(
                temp_trend[kw + '_y'] / temp_trend[kw + '_x']).mean()
            temp_trend[kw] = temp_trend[kw + '_x'] * norm_factor
        # keep only the rows that are not already present in `output`
        temp_trend = temp_trend[temp_trend.isnull().any(axis=1)]
        temp_trend['isPartial'] = temp_trend['isPartial_x']
        output = pd.concat([output, temp_trend[['date', 'isPartial'] + kw_list]],
                           axis=0)

    # reorder the columns
    output = output[['date', 'isPartial'] + kw_list]
    if not isPartial_col:
        output = output.drop('isPartial', axis=1)
    output = output[output['date'] >= self.from_date]
    if scale_cols:
        # the values in each column are relative to other columns,
        # so we need to get the maximum value across the search columns
        max_val = float(output[kw_list].values.max())
        for col in kw_list:
            output[col] = 100.0 * output[col] / max_val
    output = output.sort_values('date',
                                ascending=self.ascending).reset_index(drop=True)
    return output
## FIRST RUN ##
# Connect to Google. This only needs to run once; the rest of the requests
# will use the same session.
pytrend = TrendReq()

# Run the first time (if we want to start from today; otherwise we need to
# ask for an end_date as well)
today = datetime.today().date()
old_date = today
# Go back in time
new_date = today - timedelta(days=step)
# Create the new timeframe for which we download data
timeframe = new_date.strftime('%Y-%m-%d') + ' ' + old_date.strftime('%Y-%m-%d')
pytrend.build_payload(kw_list=kw_list, timeframe=timeframe)
interest_over_time_df = pytrend.interest_over_time()

## RUN ITERATIONS
while new_date > start_date:
    ### Save the new date from the previous iteration.
    # Overlap == 1 would mean that we start where we
    # stopped on the iteration before, which gives us
    # indeed overlap == 1.
    old_date = new_date + timedelta(days=overlap - 1)
    ### Update the new date to take a step into the past.
    # Since the timeframe that we can apply for daily data
    # is limited, we use step = maxstep - overlap instead of
    # maxstep.
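# A hedged sketch of how the truncated loop body could continue (an assumption,
# not the original author's code): fetch the next window, rescale it against an
# overlapping date, and prepend it to the stitched frame.
while new_date > start_date:
    old_date = new_date + timedelta(days=overlap - 1)
    new_date = new_date - timedelta(days=step - overlap)
    timeframe = new_date.strftime('%Y-%m-%d') + ' ' + old_date.strftime('%Y-%m-%d')
    pytrend.build_payload(kw_list=kw_list, timeframe=timeframe)
    chunk = pytrend.interest_over_time()
    if chunk.empty:
        break
    # rescale using the first shared date so the two windows agree on it
    common = chunk.index.intersection(interest_over_time_df.index)[0]
    for kw in kw_list:
        factor = interest_over_time_df.loc[common, kw] / max(chunk.loc[common, kw], 1)
        chunk[kw] = chunk[kw] * factor
    interest_over_time_df = pd.concat(
        [chunk[chunk.index < interest_over_time_df.index.min()],
         interest_over_time_df])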