def scrape():
    # load queries
    hpv_queries = load_file('hpv_queries.txt').split('\n')
    mfr_queries = load_file('mfr_queries.txt').split('\n')

    # connect to google
    trends = pyGTrends(USER, PASS)

    # scrape hpv
    hpv_matrix = []
    for query in hpv_queries:
        trends.request_report(keywords=query, date='01/2011 60m', geo='DK')
        raw = trends.get_data()
        hpv_matrix.append(parse_counts(raw))
    hpv_matrix = np.array(hpv_matrix)
    hpv_matrix = np.transpose(hpv_matrix)
    np.savetxt(DATA_DIR + 'hpv_data.txt', hpv_matrix, fmt='%i', delimiter=",")

    # scrape mfr
    mfr_matrix = []
    for query in mfr_queries:
        trends.request_report(keywords=query, date='01/2011 60m', geo='DK')
        raw = trends.get_data()
        mfr_matrix.append(parse_counts(raw))
    mfr_matrix = np.array(mfr_matrix)
    mfr_matrix = np.transpose(mfr_matrix)
    np.savetxt(DATA_DIR + 'mfr_data.txt', mfr_matrix, fmt='%i', delimiter=",")

def get_day_trends():
    time.sleep(float(initial_sleep))
    start = time.time()

    # Google Trends client
    logger.info("Connecting to Google")
    connector = pyGTrends('', '')
    logger.info("Connected to Google")

    df = pd.DataFrame()
    counter = 0
    kill_at = pd.Timestamp(strftime("%Y-%m-%d 21:00:00"))
    spent_time_connection = time.time() - start - (start % 1)

    try:
        while True:
            start = time.time()
            now = pd.Timestamp(ctime())
            if now > kill_at:
                logger.info("Killing job...")
                break
            try:
                current_trends = get_trends(connector)
                df = df.append(current_trends)
                counter += current_trends.shape[0]
            except:
                logger.error("Error requesting latest trends: {}".format(str(sys.exc_info()[0])))
                logger.error(traceback.format_exc())

            # Print a status message
            if counter % 1000 == 0:
                logger.info("{} stories fetched".format(counter))

            spent_time = time.time() - start
            logger.info("Current job took {}\n".format(spent_time))
            if spent_time_connection > 0:
                time.sleep(60 - spent_time - spent_time_connection)
                spent_time_connection = 0
            else:
                time.sleep(60 - spent_time - (start % 1))
    except:
        logger.error("Exiting: " + str(sys.exc_info()[0]))
        logger.error(traceback.format_exc())

    beginning = str(df.iloc[0]['timestamp'])
    end = str(df.iloc[df.shape[0] - 1]['timestamp'])
    df.index.name = 'rank'
    df.to_csv("trends({})[{}, {}].csv".format(hl_param, beginning, end))
    logger.info("Saved day trends from {} to {}".format(beginning, end))

def google_trend_crawler(name):
    #suggestions = {}
    google_username = "******"
    google_password = "******"

    title = name.replace('_', ' ')
    title = title.translate(None, '():')
    title = urllib.unquote(title)
    title = title.decode('utf-8')
    title = title.encode("cp1252")

    connector = pyGTrends(google_username, google_password)
    connector.request_report(title, hl='en-US')
    suggestions = connector.get_suggestions(title)

    if suggestions['default']['topics'] != []:
        title = suggestions['default']['topics'][0].get('mid')
        # wait a random amount of time between requests to avoid bot detection
        time.sleep(randint(5, 10))
        print title
        connector.request_report(title, hl='en-US')
        name = name.translate(None, ':*\'')
        name = name.decode("cp1252")
        name = urllib.quote_plus(name.encode("utf-8"))
        connector.save_csv(scientisis_dir, name)
    else:
        with open(errorfile, "a") as myfile:
            myfile.write("%s\n" % name)
    return

def __init__(self, searchterms):
    self.trends = []
    self.trends.append(searchterms)
    con = pyGTrends("trendsscraper123", "googleTrends")
    con.request_report(self.formatSearchTerms(searchterms), date="today 7-d")
    data = con.get_data()
    self.trends += self.processScore(data)

def __init__(self, file_paths, credentials):
    # Set file paths
    self.file_paths = file_paths
    wb_filepath = os.path.abspath(file_paths['input_file'])
    # Read an input workbook
    self.in_workbook = openpyxl.load_workbook(filename=wb_filepath)
    # Read country codes.
    self.country_codes = self.get_country_codes(file_paths['country_codes_file'])
    # Connect to Google
    self.g_connector = pyGTrends(credentials['google_username'], credentials['google_password'])

def __init__(self):
    # connect to Google
    try:
        self.google_username = "******"
        self.google_password = "******"
        self.connector = pyGTrends(self.google_username, self.google_password)
        time.sleep(randint(5, 10))
    except:
        print('I am unable to connect to google trends.')
        sys.exit(-1)

def get_trend_score(query, horizon=52, trail=3):
    try:
        # connect to Google Trends and request a report for the query
        connector = pyGTrends(_GNAME, _GPASS)
        connector.request_report(query)
        sleep(5)
        # parse the downloaded report and compute trend statistics
        stock_data = parse_data(connector.decode_data)
        trend = TrendAnalysis(query, stock_data)
        trend.set_stats()
        return trend
    except Exception as e:
        _log.error(e)
        return "N/A"

def __init__(self, path, startyear):
    self.startyear = startyear
    # ADD YOUR ACCOUNT INFO
    self.google_username = ""
    self.google_password = ""
    if not os.path.exists(path):
        os.mkdir(path)
    self.logfilename = path + "log-fails.txt"
    self.connector = pyGTrends(self.google_username, self.google_password)
    self.path = path

def main():
    name = sys.argv[1]
    if len(sys.argv) >= 2:
        for argument in sys.argv[2:]:
            name += " " + argument

    newpath = "GoogleTrendsData/" + name
    if not os.path.exists(newpath):
        os.makedirs(newpath)
    path = "GoogleTrendsData/"
    path += name + "/"
    csv_name = name + "_trend"
    cleaned_csv_name = name + "_trend_cleaned.csv"

    # connect to Google
    connector = pyGTrends(google_username, google_password)
    # make request
    connector.request_report(name)
    # wait a random amount of time between requests to avoid bot detection
    time.sleep(randint(5, 10))
    # download file
    connector.save_csv("../../src/main/resources/static/", "google_trend")

    with open("../../src/main/resources/static/google_trend.csv") as in_file:
        csv_reader = csv.reader(in_file)
        for i in range(5):
            next(csv_reader)
        dates_mapping = defaultdict(int)
        for line in csv_reader:
            if not line:
                break
            dateString = line[0]
            dates = dateString.split(' - ')
            start_dates = dates[0].split('-')
            start_year = start_dates[0]
            start_month = start_dates[1]
            dates_mapping[start_year + "/" + start_month] += int(line[1])

    with open("../../src/main/resources/static/google_trend_cleaned.csv", "w") as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(['time', 'quantity'])
        for item in sorted(dates_mapping.items()):
            csv_writer.writerow([item[0], item[1]])

def generateCSV(self):
    google_username = "******"
    google_password = "******"
    path = "/Users/AnnaGupta/hack-cmu-2015/pytrends-master/examples/"
    # connect to Google
    connector = pyGTrends(google_username, google_password)
    # make request
    connector.request_report("%s" % self.word, hl="en-US", cat=None, geo="US", date=None)
    # wait a random amount of time between requests to avoid bot detection
    time.sleep(randint(5, 10))
    # download file
    connector.save_csv(path, self.word)

def generateCSV(self):
    google_username = "******"
    google_password = "******"
    path = os.getcwd() + "/"
    # connect to Google
    connector = pyGTrends(google_username, google_password)
    # make request
    connector.request_report("%s" % self.word, hl='en-US', cat=None, geo='US', date="today 7-d")
    # wait a random amount of time between requests to avoid bot detection
    time.sleep(randint(5, 10))
    # download file
    connector.save_csv(path, "data")

def mongo_write():
    client = pymongo.MongoClient('localhost', 27017)
    db = client['googletrends']
    collection = db['genre_query_history']
    # path = ""
    # results = get_all_freebase_genres()
    # print results
    count = 0
    # print len(results["results"]["bindings"])
    for result in collection.find():
        count += 1
        print count
        # print result
        # break
        fid = result['freebase_id']
        # fid = '/m/0bkbm'
        print fid
        if 'query_overtime' not in result:
            continue
        if 'query_us_states' in result:
            continue
        # connect to Google
        connector = pyGTrends(google_username, google_password)
        # make request
        connector.request_report(fid, geo='US')
        print
        res_json = csv2json(connector.decode_data.split('\n'))
        res_json['freebase_id'] = fid
        # print res_json #['query_regions']
        # collection.find_one({'query_title': fid}).update()
        # if collection.find_one({'query_title': fid}) == None:
        #     collection.insert_one(res_json)
        if 'query_us_states' in res_json:
            genre = collection.find_one({'freebase_id': fid})
            # print genre
            genre['query_us_states'] = []
            genre['query_us_states'] = res_json['query_us_states']
            collection.save(genre)
        # break
        # wait a random amount of time between requests to avoid bot detection
        time.sleep(randint(5, 10))

def get_top_date(actor_name, peak_num):
    google_username = "******"
    google_password = "******"
    fid = es_dao.get_actor_freebase_id_by_name(actor_name)
    print fid
    connector = pyGTrends(google_username, google_password)
    # make request
    connector.request_report(fid)
    res_json = csv2json(connector.decode_data.split('\n'))
    res_json['freebase_id'] = fid
    # print "Here is result in json:", res_json

    overtime_data = []
    timestamp_start = []
    timestamp_end = []
    for weekly_data in res_json['query_overtime']:
        overtime_data.append(int(weekly_data['querycount']))
        timestamp_start.append(weekly_data['starttime'])
        timestamp_end.append(weekly_data['endtime'])
    # print res_json['query_overtime']
    # print timestamp
    # print overtime_data

    sorted_data = sorted(range(len(overtime_data)), key=lambda i: overtime_data[i])[-peak_num:]
    # print sorted_data
    for index in sorted_data:
        print "date:{}, count{}".format(timestamp_start[index], overtime_data[index])
    # array_data = np.asarray(overtime_data)
    # peakind = signal.find_peaks_cwt(array_data, np.arange(5, 10))
    # print peakind
    # plt.plot(overtime_data)
    # plt.show()

    # wait a random amount of time between requests to avoid bot detection
    time.sleep(randint(5, 10))
    return sorted_data, overtime_data, timestamp_start, timestamp_end

def run(self):
    google_username = "******"
    google_password = "******"
    path = "/home/vagner/workspace/ITSGoogleTrends/output/"

    # connect to Google
    try:
        self.show("Connecting with user " + google_username)
        connector = pyGTrends(google_username, google_password)
        self.show("Connection established successfully")
    except Exception as e:
        raise ITSGoogleTrendsError("Error while connecting to Google.")

    # build the request string
    # read the data from the CSV file
    table = rows.import_from_csv(self._CSV_FILE_PATH + self._CSV_FILE_NAME)
    rows_number = len(table)
    index = 0
    for row in table:
        its_name = str(row.system).lower()
        index = index + 1
        self.show("Starting the trend data search for ITS: {0:s} [{1:d}/{2:d}]".format(its_name, index, rows_number))
        str_request = self._DEFAUT_KEYWORD + "," + its_name
        self.show("Making a request with the query " + str_request)
        # make request
        connector.request_report(str_request)
        # download file
        self._now = str(datetime.now().strftime('%Y-%m-%d-%H-%M-%S'))
        csv_file_name = "{0:s}-{1:s}".format(self._now, str_request.replace(",", "-").replace(" ", "-"))
        connector.save_csv(path, csv_file_name)
        self.show("Results written to the file {0:s}.csv".format(csv_file_name))
        # wait a random amount of time between requests to avoid bot detection
        wait_time = randint(5, 10)
        self.show("Waiting {0:d} seconds before the next request".format(wait_time))
        time.sleep(wait_time)
    # end for
    self.show("Execution completed successfully!")

def GetGoogleTrends(subject, page_id, deleteTempFile=False):
    """
    Grabs data from Google Trends for a given topic.
    Creates a temporary csv file that can optionally be deleted after use.

    inputs
        subject: topic to search (string)
        deleteTempFile: if True, deletes the temporary file created after use
    output
        DataFrame of the number of searches for each week
    """
    subject = re.sub(',', '', subject)
    google_username = "******"
    google_password = "******"
    path = '%s_temp.csv' % subject

    connector = pyGTrends(google_username, google_password)
    connector.request_report(subject)
    data = connector.decode_data
    # data is a string intended to become a csv file;
    # the first four lines are useless and need to be removed.
    # there is likely a better way to do this than what I do below, but it works
    data = re.sub('Web.*', '', data)
    data = re.sub('World.*', '', data)
    data = re.sub('World.*', '', data)
    data = re.sub('Interest.*', '', data)
    data = re.sub('\n\n\n\n', '', data)
    data = re.split('.*, \n', data)[0]
    data = re.split('\n\n', data)[0]

    # write a temporary file, then open it in pandas... this is just easier, but there might be a better way
    f = open(path, 'w+')
    f.write(data)
    f.close()
    try:
        df = pd.read_csv(path)
    except:
        return None
    df.columns = ['Week', 'Searches']
    df['Week'] = [re.sub('-', '', re.split(' - ', week)[0]) for week in df['Week']]
    df['page_id'] = [page_id for week in df['Week']]
    df = df[['page_id', 'Week', 'Searches']]
    if deleteTempFile:
        os.remove(path)
    return df

def main():
    googleUsername = "******"
    googlePassword = "******"

    # connect to Google
    connector = pyGTrends(googleUsername, googlePassword)

    with open("listFinal.txt", "r") as f:
        # creates an array, each element containing a card name as a string
        cards = [line.rstrip('\n') for line in f]

    for card in cards:
        connector.request_report(card, geo="US", date="today 90-d")
        connector.save_csv("./output/", card)
        # so that google trends doesn't get suspicious of scripting...
        time.sleep(randint(4, 8))

    path = "./output"
    os.chdir(path)
    # gets rid of all the junk in the csv file
    for file in glob.glob("*.csv"):
        f = open(file, "r")
        lines = f.readlines()
        f.close()
        del(lines[0:5])
        lines = lines[0:90]
        f = open(file, "w")
        for line in lines:
            f.write(line)
        # close the rewritten file so its contents are flushed to disk
        f.close()

def main(keywords):
    google_username = "******"
    google_password = "******"
    path = ""
    # regions = ['US-AK-743', 'US-AK-745', 'US-AK-747']
    regions = ['US-AK-743']
    sample_counter = 1

    # connect to Google
    connector = pyGTrends(google_username, google_password)
    print "connected"

    # make request
    for region in regions:
        while sample_counter < 10:
            connector.request_report(keywords, hl='en-US', geo=region)
            print "waiting..."
            time.sleep(randint(5, 10))
            print "saving..."
            # download file
            connector.save_csv(path, region + '_' + str(sample_counter))
            print "done with iteration", sample_counter
            sample_counter += 1

#
# Install requirements with pip
#
from pytrends.pyGTrends import pyGTrends
import time
import os
from random import randint
import pandas as pd

# Add your Gmail username to the google_username variable and your Gmail password
# to the google_password variable.
google_username = ""
google_password = ""
connector = pyGTrends(google_username, google_password)

# This script downloads a series of CSV files from Google Trends. Please specify a
# filepath for where you'd like these files to be stored in the variable below.
path = ""

# Specify the filename of a CSV with a list of keywords in the variable keywordcsv.
# The CSV should be one column, with the header equal to Keywords (case sensitive).
keywordcsv = "keywords.csv"
keywords = pd.read_csv(keywordcsv)

# Download and calculate slope:
keywordlist = pd.DataFrame(columns=["keyword", "slope"])
for index, row in keywords.iterrows():
    print("Downloading Keyword #" + str(index))
    payload = {'geo': 'US', 'q': [row[0]]}
    connector.request_report(payload)
    time.sleep(randint(5, 10))

        # If weekly data, convert to monthly.
        wordfile = onlymonths(wordfile, monthsnum)
        if len(wordfile) > 0:
            months = wordfile
            wordfilenum = map(lambda x: x.split(",")[1], wordfile)
            result[:, i] = np.array(wordfilenum)
        time.sleep(randint(2, 5))
    words = np.array(words)[dont_skip]
    months = map(lambda x: x.split(",")[0], months)
    result = result[:, dont_skip]
    return (result, words, months)


def main(text1, text2, name, google):
    words = tokenize(text1, text2)
    save_words(name, words)
    data = getData(words, name, google)
    save_csv("../data/" + name + "dat.csv", data[0], data[1], data[2])
    print "done with " + name


if __name__ == '__main__':
    # getData([u"æøå"], "")
    print "Talking with google. This takes time!"
    google_username = "******"
    google_password = "******"
    google = pyGTrends(google_username, google_password)
    main("../data/MFR1.txt", "../data/MFR2.txt", "MFR", google)
    # main("../data/DiTe1.txt", "../data/DiTe2.txt", "DiTe", google)
    main("../data/HPV1.txt", "../data/HPV2.txt", "HPV", google)
    # main("../data/PCV1.txt", "../data/PCV2.txt", "PCV", google)

def run(type, online=True):
    # Lasso prediction - tolerance set to avoid a convergence warning
    model = linear_model.Lasso(tol=0.001)
    tmp = []
    google_username = "******"  # We Be Anonymous
    google_password = "******"
    # Location for .csv
    path = "trends/" + type + "/"

    # Using pytrends for gathering data from Google - but only if online, else uses local data
    # connect to Google
    if online:
        connector = pyGTrends(google_username, google_password)

    # tokenizer will determine the trends needed
    for trend in tokenizer.run(type):
        # file names should avoid danish special characters
        name = trend.replace('ø', 'oe').replace('æ', 'ae').replace('å', 'aa')
        if online:
            # make request
            connector.request_report(str(trend), hl='dk', geo='DK', date="01/2011 57m")
            # wait a random amount of time between requests to avoid bot detection
            time.sleep(randint(3, 6))
            # download file
            connector.save_csv(path, name)
        # Once a csv file has been recovered, extract the monthly information
        months = regular.GetArrayFromFile("" + path + name + ".csv")
        if months != None:
            tmp.append(months)

    # Convert the tmp list to a numpy array of proper dimension
    X = np.array(tmp).transpose()

    # Extract clinical data
    json_pattern = re.compile('[0-9]+\.[0-9]+')
    f = open("vactionations/" + type + "-1.json", "r")
    data = f.read()
    f.close()
    match = json_pattern.findall(data)
    # Y is now the clinical data
    Y = np.array([float(x) for x in match][0:len(X)])

    # Perform 5-fold cross-validation
    k_fold = cross_validation.KFold(len(X), 5)
    v = 0
    plt.figure(type + " 5-fold graphs")
    for k, (train, test) in enumerate(k_fold):
        model.fit(X[train], Y[train])
        plot_y = model.predict(X[test])
        plt.subplot(5, 1, k + 1)
        plt.ylabel("Fold %s" % (k + 1))
        plot_x = range(len(plot_y))
        plt.plot(plot_x, plot_y, color='b', label='Prediction')
        plt.plot(plot_x, Y[test], color='r', label='Clinical')
        # RMSE for each fold is summed
        RMSE = mean_squared_error(Y[test], plot_y)
        v += RMSE
        print test
    # overall RMSE is determined
    print type, "RMSE", np.sqrt(v / 5)

    # For fun, a full prediction is made, to compare the model after 5 folds
    # and the ground truth data
    plt.xlabel("Months")
    plt.show()
    plt.figure(type + " full prediction")
    plot_y = model.predict(X)
    plot_x = range(len(plot_y))
    plt.plot(plot_x, plot_y, color='b', label='Prediction')
    plt.plot(plot_x, Y, color='r', label='Clinical')
    plt.xlabel("Months")
    plt.legend(loc="upper right", fancybox=True)
    plt.show()

MYUSERNAME = "******"
google_username = config.google["User_Name"]
google_password = config.google["Password"]

logging.basicConfig(filename="logs.log", level=logging.INFO)


def now_time():
    now = datetime.datetime.now()
    return now.strftime("[%Y/%m/%d %H:%M:%S]")


logging.info("%s Connecting to google..." % now_time())
print("%s Connecting to google..." % now_time())
MYCONNECTOR = pyGTrends(google_username, google_password)
logging.info("%s Connected to google" % now_time())
print("%s Connected to google" % now_time())
logging.info("\n")

# time to sleep if got a 420 error
BACKOFF = 2
backoff = BACKOFF

# don't respond to queries from these accounts
BLACKLIST = ["pixelsorter", "lowpolybot", "slashkarebear", "slashgif", "slashremindme"]

TIME_SPAN_OPTIONS = [str(a) + "y" for a in range(1, 12)]
TIME_SPAN_OPTIONS += [str(a) + "m" for a in range(1, 91)]
TIME_SPAN_OPTIONS += [str(a) + "d" for a in range(1, 91)]

# connector = pyGTrends(google_username, google_password)
server = 'localhost'
database = 'GoogleTrends'
user = '******'
password = '******'

log.info('Connecting to data base....!')
connection = pymysql.connect(host=server, user=user, password=password, db=database)
log.info('We are connected to Database')

keywords = 'Policybazaar'
log.info('Connecting to Google Trends....!')
connector = pyGTrends(google_username, google_password)
log.info('Connected to Google Trends.')

while True:
    # connect to Google
    # connector = pyGTrends(google_username, google_password)
    # make request
    log.info('we are inside while loop')
    try:
        connector.request_report(keywords, hl='en-US', cat=None, geo='IN', date='now 1-H', tz="Etc/GMT-5:30")
        log.info('Requested query got...')
        xx = connector.get_data()
        yy = str(xx)
        ll = yy.split('\n')

from pytrends.pyGTrends import pyGTrends
import time
from random import randint

google_username = "******"
google_password = "******"
path = ""

# connect to Google
custom_useragent = {'User-Agent': 'My Pytrends Script'}
connector = pyGTrends(google_username, google_password, custom_useragent)

# make request
payload = {
    'q': ['Pizza, Italian, Spaghetti, Breadsticks, Sausage'],
    'cat': '0-71'
}
connector.request_report(payload)

# wait a random amount of time between requests to avoid bot detection
time.sleep(randint(5, 10))

# download file
connector.save_csv(path, "pizza")

# get suggestions for keywords
keyword = "milk substitute"
data = connector.get_suggestions(keyword)
print(data)

def __init__(self, search_words=None, file_name="report.csv"):
    self.save_path = "pytrends/" + file_name
    self.search_words = search_words
    self.connector = pyGTrends("*****@*****.**", "gotneedforspeed")