class Trends:
    """Thin convenience wrapper around a single pytrends ``TrendReq`` session.

    The session is created with ``hl='uk'`` and ``tz=360`` and the payload for
    the given keywords is built in the constructor, so every method simply
    forwards to the corresponding ``TrendReq`` method.
    """

    def __init__(self, keywords):
        """Create the session and build the payload.

        Args:
            keywords: Search terms handed to ``TrendReq.build_payload``.
        """
        self.pytrends = TrendReq(hl='uk', tz=360)
        self.pytrends.build_payload(keywords)

    def interest_over_time(self):
        """Return ``TrendReq.interest_over_time()`` for the built payload."""
        return self.pytrends.interest_over_time()

    def interest_by_region(self, resolution='COUNTRY', inc_low_vol=False):
        """Return ``TrendReq.interest_by_region()`` for the built payload.

        Args:
            resolution: Forwarded unchanged as ``resolution``.
            inc_low_vol: Forwarded unchanged as ``inc_low_vol``.
        """
        return self.pytrends.interest_by_region(resolution=resolution,
                                                inc_low_vol=inc_low_vol)

    def related_topics(self):
        """Return ``TrendReq.related_topics()`` for the built payload."""
        return self.pytrends.related_topics()

    def related_queries(self):
        """Return ``TrendReq.related_queries()`` for the built payload."""
        return self.pytrends.related_queries()

    def related_querries(self):
        """Misspelled legacy name of :meth:`related_queries`.

        Kept so existing callers continue to work.
        """
        return self.related_queries()

    def trending_searches(self, pn='ukraine'):
        """Return ``TrendReq.trending_searches(pn=pn)``."""
        return self.pytrends.trending_searches(pn=pn)

    def top_charts(self, date, hl='uk', tz=360, geo='GLOBAL'):
        """Return ``TrendReq.top_charts`` for ``date``.

        ``hl``, ``tz`` and ``geo`` are forwarded unchanged as keyword
        arguments.
        """
        return self.pytrends.top_charts(date, hl=hl, tz=tz, geo=geo)

    def suggestions(self, keyword):
        """Return ``TrendReq.suggestions(keyword)``."""
        return self.pytrends.suggestions(keyword)
def showcompare(name, list):
    """Plot a word cloud of Google's related topics next to one of ``list``.

    The first element of ``list`` is reduced to plain ASCII and used as the
    Google Trends keyword (``geo='US'``, ``gprop='news'``,
    ``timeframe='today 1-m'``). Two word clouds are then drawn with
    ``plot_wordcloud``: one named ``name + "Google"`` from the related-topic
    titles, and one named ``name`` from ``list`` itself.

    Args:
        name: Label passed to ``plot_wordcloud``.
        list: Iterable of words; its first element becomes the keyword.
            (The parameter shadows the builtin; the name is kept so keyword
            callers do not break.)

    Raises:
        StopIteration: If ``list`` is empty.
    """
    pytrends = TrendReq(hl='en-US')
    first_word = next(iter(list))
    # ``encode`` alone yields ``bytes`` on Python 3, which is not a usable
    # keyword or dict key here; decode back so the keyword stays a ``str``
    # with accents and other non-ASCII characters dropped.
    word = (unicodedata.normalize('NFKD', first_word)
            .encode('ascii', 'ignore')
            .decode('ascii'))
    pytrends.build_payload(kw_list=[word], timeframe='today 1-m', geo='US',
                           gprop='news')
    # NOTE(review): indexing by 'title' assumes the pytrends version in use
    # returns a DataFrame with that column per keyword - confirm.
    related_topics_df = pytrends.related_topics()[word]
    plot_wordcloud(name + "Google", related_topics_df['title'].tolist())
    plot_wordcloud(name, list)
def get_data(file_content):
    """Collect interest-over-time and rising related topics for each term.

    For every search term a payload is built for ``geo='IN'`` over the
    module-level ``start_date``/``end_date`` window. The interest frame
    (without its ``isPartial`` column) is appended column-wise to the global
    ``final_dict``; the rising related topics are prepended column-wise to the
    global ``related_topic_df``. A term that fails for any reason is reported
    with ``print`` and skipped. Both globals are finally written as CSV files
    whose names carry the module-level ``today`` value.

    Args:
        file_content: Iterable of search terms.
    """
    global final_dict
    global related_topic_df

    client = TrendReq(hl='india', tz=360, retries=5, backoff_factor=0.5)
    window = f"{start_date} {end_date}"

    for term in file_content:
        try:
            client.build_payload(kw_list=[term], timeframe=window, geo='IN')

            interest = client.interest_over_time().reset_index()
            interest = interest.drop(labels=['isPartial'], axis=1)
            final_dict = pd.concat([final_dict, interest], axis=1)

            rising = client.related_topics()[term]['rising'].reset_index()
            related_topic_df = pd.concat([rising, related_topic_df],
                                         ignore_index=True, axis=1)
        except Exception as error:
            print(error)

    final_dict.to_csv(f'..//..//output/final_df_{today}.csv', index=False)
    related_topic_df.to_csv(f'..//..//output/related_topic_df_{today}.csv',
                            index=False)
async def google_trends(request):
    """HTTP handler returning Google Trends data for one or more topics.

    Query arguments read from ``request.args``:
        topic: Comma-separated search terms (required).
        lang, reg: Combined as ``"<lang>-<reg>"`` for the ``hl`` setting.

    Returns:
        The result of the module's ``json`` response helper. On success the
        payload holds ``interest``, ``related_topics``, ``top_queries``,
        ``rising_queries`` (JSON strings) and ``countries`` (a dict), all but
        ``interest`` for the first topic only. On a non-GET request or a
        missing/blank ``topic`` an error payload with ``response: False`` is
        returned instead.
    """
    if request.method != 'GET':
        return json({'response': False, 'message': 'Wrong HTTP method',
                     'results': []})

    raw_topic = request.args.get('topic')
    lang = request.args.get('lang')
    region = request.args.get('reg')

    # Validate before splitting: calling ``.split`` on a missing argument
    # raised AttributeError, and the old ``topic is ''`` identity test on the
    # already-split list could never be true.
    topic = ([part.strip() for part in raw_topic.split(',') if part.strip()]
             if raw_topic else [])
    if not topic:
        return json({'response': False, 'message': 'Not enough arguments',
                     'results': []})

    # NOTE(review): ``json`` above and below is the response helper in scope
    # in this module, not the stdlib module - hence the local ``loads`` import.
    from json import loads
    from pytrends.request import TrendReq

    pytrends = TrendReq(hl='{}-{}'.format(lang, region), tz=360)
    pytrends.build_payload(topic)

    res = pytrends.interest_over_time()
    res.drop('isPartial', axis=1, inplace=True)

    rel_topics = pytrends.related_topics()[topic[0]]
    rel_topics.drop('mid', axis=1, inplace=True)
    rel_queries = pytrends.related_queries()[topic[0]]

    countries = pytrends.interest_by_region(resolution='COUNTRY')
    # Keep only the rows that have at least one non-zero value.
    countries = countries[(countries.T != 0).any()]
    countries = loads(countries.to_json())[topic[0]]

    return json({
        'message': 'done',
        'response': True,
        'result': {
            'interest': res.to_json(),
            'related_topics': rel_topics.to_json(),
            'top_queries': rel_queries['top'].to_json(),
            'rising_queries': rel_queries['rising'].to_json(),
            'countries': countries,
        },
    })
def parse(self, response):
    """Yield one ``RelatedTopicsItem`` per related topic of each search term.

    ``self.search_term`` may hold several terms separated by ``'|||'``. All of
    them go into one Google Trends payload (using ``self.cat``,
    ``self.start_date``/``self.end_date``, ``self.geo`` and ``self.gprop``),
    and the ``rising`` and ``top`` related-topic tables are then read for each
    term individually.

    Args:
        response: Unused by this method.

    Yields:
        RelatedTopicsItem: ``top`` carries the formatted value for entries
        from the ``top`` table and ``breakout`` for entries from the
        ``rising`` table; the other field is an empty string.

    Any exception is printed and ends the generator (best effort).
    """
    try:
        terms = self.search_term.split('|||')
        pytrends = TrendReq(hl='en-US', tz=360)
        pytrends.build_payload(
            terms,
            cat=self.cat,
            timeframe='{} {}'.format(self.start_date, self.end_date),
            geo=self.geo,
            gprop=self.gprop)
        related = pytrends.related_topics()

        for key in ('rising', 'top'):
            # Look each term up under its own key: the combined 'a|||b'
            # string is never a key of the result, and items must be credited
            # only to the term they belong to.
            for search_term in terms:
                frame = (related.get(search_term) or {}).get(key)
                if frame is None:
                    # No table was returned for this term/key combination.
                    continue
                json_data = json.loads(frame.to_json(orient="table"))
                for item in json_data['data']:
                    value = item['formattedValue']
                    yield RelatedTopicsItem(
                        search_term=search_term,
                        search_phrase=item['topic_title'],
                        top=value if key == 'top' else '',
                        breakout=value if key == 'rising' else '')
    except Exception as e:
        print(e)
def getTrends(self, q, genType):
    """Chart Google Trends data for ``q`` and return related-topic links.

    Two charts are drawn: search interest over the last two years, and a bar
    chart of the ten top related topics. Depending on ``genType`` they are
    saved under ``static/images/query/`` (``'query'``) or
    ``static/images/suggested/`` (``'suggested'``, file name derived from
    ``q``); for any other value nothing is saved.

    Args:
        q: Search term.
        genType: ``'query'`` or ``'suggested'``; selects the output paths.

    Returns:
        list[str]: Google Trends explore URLs for the top related topics of
        the last keyword processed; empty when there are none.
    """
    from urllib.parse import quote

    # build query
    pytrend = TrendReq()
    pytrend.build_payload(kw_list=[q])

    # get timeseries data
    df = pytrend.interest_over_time()

    # Keep only the last 2 years. Compare against a real datetime: the old
    # query string compared the index with a bare integer such as 20220105,
    # which is not a date comparison.
    today = datetime.today()
    try:
        cutoff = today.replace(year=today.year - 2)
    except ValueError:
        # 29 February has no counterpart in a non-leap year.
        cutoff = today.replace(year=today.year - 2, day=28)
    df = df[df.index > cutoff]

    # plot and save the time series
    plot = df.plot()
    plt.title('Search Interest Over Time')
    plt.ylabel('Relative Interest')
    plt.xlabel('Date')
    filename = q.replace(' ', '_')
    if genType == 'query':
        plt.savefig('static/images/query/google.png', bbox_inches='tight')
    if genType == 'suggested':
        plt.savefig('static/images/suggested/google_' + str(filename) + '.png',
                    bbox_inches='tight')
    # Release the figure; otherwise every call leaves one more open figure.
    plt.close(plot.figure)

    # get related topics and graph
    trendlist = []
    related = pytrend.related_topics()
    for key in related:
        top = related[key]['top']
        if top is None:
            # No top related topics were returned for this keyword.
            continue
        toprelated = top.head(10)
        # ``str.replace`` returns a new string, so the old bare call had no
        # effect; percent-encode the title so the link is a valid URL.
        trendlist = [
            "https://trends.google.com/trends/explore?geo=US&q=" +
            quote(str(title))
            for title in toprelated['topic_title']
        ]
        plot = toprelated.plot(x='topic_title', y='value', kind='bar',
                               legend=None)
        plt.ylabel('Relative Similarity')
        plt.xlabel('Related Topics')
        plt.title('People who searched for ' + str(q) + ' also searched for:')
        if genType == 'query':
            plt.savefig('static/images/query/google1.png', bbox_inches='tight')
        if genType == 'suggested':
            plt.savefig('static/images/suggested/google1_' + str(filename) +
                        '.png', bbox_inches='tight')
        plt.close(plot.figure)
    return trendlist
def get_suggestions_and_topics(participant_name, hl='en-US', geo="", gprop="",
                               timeframe='2017-06-01 2020-06-01'):
    """Fetch related topics and keyword suggestions for one name.

    Args:
        participant_name: Search term; used both as the only payload keyword
            and as the suggestions query.
        hl: Forwarded to ``TrendReq`` as ``hl``.
        geo: Forwarded to both ``TrendReq`` and ``build_payload`` as ``geo``.
        gprop: Forwarded to ``build_payload`` as ``gprop``.
        timeframe: Forwarded to ``build_payload`` as ``timeframe``. Defaults
            to the previously hard-coded ``'2017-06-01 2020-06-01'``.

    Returns:
        tuple: ``(related_topics, suggestions)`` exactly as returned by
        ``TrendReq.related_topics()`` and ``TrendReq.suggestions()``.
    """
    pytrends = TrendReq(hl=hl, tz=360, geo=geo)
    pytrends.build_payload([participant_name], cat=0, timeframe=timeframe,
                           geo=geo, gprop=gprop)
    dict_related_topic = pytrends.related_topics()
    suggestions = pytrends.suggestions(participant_name)
    return dict_related_topic, suggestions
def __get_top_searches(self, search_term: str):
    """Return the related topics for ``search_term``, or ``False`` on failure.

    A fresh ``TrendReq`` session is created for every call. Any exception
    raised while building the payload or fetching the related topics is
    swallowed and reported to the caller as ``False``; an error while creating
    the session itself is not caught.
    """
    client = TrendReq()
    try:
        client.build_payload([search_term])
        return client.related_topics()
    except Exception:
        return False
class Trends:
    '''
    Wrapper class for the pytrends library.

    One ``TrendReq`` session is created per instance (``hl='uk'``,
    ``tz=360``) and its payload is built from the keywords given to the
    constructor; every method forwards to that session.
    '''

    def __init__(self, keywords):
        '''
        Initialize the trends object and build the payload for ``keywords``.
        '''
        self.pytrends = TrendReq(hl='uk', tz=360)
        self.pytrends.build_payload(keywords)

    def interest_over_time(self):
        '''
        Get interest over time.
        '''
        return self.pytrends.interest_over_time()

    def interest_by_region(self, resolution='COUNTRY', inc_low_vol=False):
        '''
        Get interest by region.

        ``resolution`` and ``inc_low_vol`` are forwarded unchanged.
        '''
        return self.pytrends.interest_by_region(resolution=resolution,
                                                inc_low_vol=inc_low_vol)

    def related_topics(self):
        '''
        Get related topics.
        '''
        return self.pytrends.related_topics()

    def related_queries(self):
        '''
        Get related search queries.
        '''
        return self.pytrends.related_queries()

    def related_querries(self):
        '''
        Misspelled legacy name of ``related_queries``; kept for existing
        callers.
        '''
        return self.related_queries()

    def trending_searches(self, pn='ukraine'):
        '''
        Get trending searches by country.
        '''
        return self.pytrends.trending_searches(pn=pn)

    def top_charts(self, date, hl='uk', tz=360, geo='GLOBAL'):
        '''
        Get top charts by date.

        ``hl``, ``tz`` and ``geo`` are forwarded unchanged.
        '''
        return self.pytrends.top_charts(date, hl=hl, tz=tz, geo=geo)

    def suggestions(self, keyword):
        '''
        Get suggestions for keyword.
        '''
        return self.pytrends.suggestions(keyword)
def related_topics(keywords):
    """Build a short "related topics" summary string for each keyword.

    Each keyword is queried on its own with a fresh ``TrendReq`` session. Its
    summary is ``'Related Topics: None'`` when the request fails or no top
    topics come back; otherwise it is ``'Related topics:'`` followed by up to
    four numbered topic titles.

    Args:
        keywords: Iterable of search terms.

    Returns:
        dict: Maps every keyword to its summary string.
    """
    no_topics = 'Related Topics: None'
    related_topics_dict = {}

    for keyword in keywords:
        # Prevent a pytrends failure from aborting the whole run. Session
        # creation and payload building are guarded too, not only the final
        # call, so one failing keyword never loses the others.
        try:
            pytrends = TrendReq(hl='en-US', tz=360, timeout=(10, 25),
                                retries=2, backoff_factor=0.1)
            pytrends.build_payload([keyword])
            data = pytrends.related_topics()
        except Exception:
            related_topics_dict[keyword] = no_topics
            continue

        print(keyword)

        # Only one keyword was sent, so the first entry is the one we want.
        first_entry = next(iter(data.values()), None) if data else None
        df = first_entry.get('top') if first_entry else None

        if df is None or df.empty:
            related_string = no_topics
        else:
            related_topics_list = ['Related topics:']
            # NOTE(review): labels 1-4 are read, so the row labelled 0 is
            # never listed. Kept as in the original - confirm whether
            # skipping the first row is intentional.
            for i in range(1, 5):
                try:
                    title = df.at[i, 'topic_title']
                except KeyError:
                    break
                related_topics_list.append(f"{i}.{title}")
            related_string = ' '.join(related_topics_list)

        related_topics_dict[keyword] = related_string

    return related_topics_dict
def gtrend_getvalue(kw_list, output_file, timeframe):
    """
    Fetch data from Google Trends through the pytrends library and export it.

    Related queries, related topics, interest by region and interest over
    time are each passed to ``exportdata``; suggestions for the first keyword
    are passed to ``suggest_to_excel``. Every export is labelled with the
    first keyword followed by a fixed suffix. Any exception is caught and its
    traceback is printed.

    # pytrends ref https://pypi.org/project/pytrends/#interest-by-region
    """
    try:
        first_kw = kw_list[0]
        client = TrendReq(hl='ja-JP', tz=360)
        client.build_payload(kw_list, cat=0, timeframe=timeframe, geo='JP',
                             gprop='')

        # (fetcher, label suffix, last argument handed to exportdata),
        # in order: related queries, related topics, interest by region,
        # interest over time.
        exports = (
            (client.related_queries, 'query', 1),
            (client.related_topics, 'topic', 1),
            (lambda: client.interest_by_region(resolution='REGION',
                                               inc_low_vol=True,
                                               inc_geo_code=False),
             'region', 0),
            (client.interest_over_time, 'overtime', 0),
        )
        for fetch, suffix, flag in exports:
            exportdata(fetch(), output_file, first_kw + suffix, flag)

        # Suggestions
        suggest_to_excel(client.suggestions(first_kw), output_file,
                         first_kw + 'suggestions')

        # Trending keywords (disabled):
        # trendsword = client.trending_searches(pn='united_states')  # USA
        # trendsword = client.trending_searches(pn='japan')  # Japan
        # exportdata(trendsword, output_file, "trendword", 0)
    except Exception as e:
        print(traceback.format_exception(*sys.exc_info()))
        print(traceback.format_tb(e.__traceback__))
def get_related_topics(sessiontrend=None, term='', geo='GB',
                       timeframe='today 5-y'):
    """Return the rising related topics for ``term`` as a JSON string.

    Args:
        sessiontrend: Optional existing session to reuse. When given, no
            payload is built here, so it must already have one that includes
            ``term``; ``geo`` and ``timeframe`` are then ignored.
        term: Search term whose rising related topics are wanted.
        geo: ``geo`` for the payload built when no session is supplied.
        timeframe: ``timeframe`` for that payload.

    Returns:
        str: JSON array of rows (one list of column values per topic), or
        ``"[]"`` when no rising-topics table is available for ``term``.

    Raises:
        KeyError: If the session's result has no entry for ``term``.
    """
    if sessiontrend is None:
        pytrends = TrendReq(hl='en-US', tz=0)
        pytrends.build_payload([term], cat=0, timeframe=timeframe, geo=geo,
                               gprop='')
    else:
        pytrends = sessiontrend

    rising = pytrends.related_topics()[term].get('rising')
    if rising is None:
        # No rising table for this term; report an empty list rather than
        # failing on ``None.values``.
        return json.dumps([])
    return json.dumps(rising.values.tolist())
class Trends:
    """Look up top related topics for a keyword through one pytrends session."""

    def __init__(self):
        """Create the shared ``TrendReq`` session."""
        self.pytrends = TrendReq()

    def get_results(self, keyword):
        """Return lower-cased top related topic titles for ``keyword``.

        The first suggestion for ``keyword`` supplies the ``mid`` that is
        queried (``timeframe='today 5-y'``). Of its first three top related
        topic titles, those that differ from ``keyword`` (case-insensitively)
        are returned.

        Args:
            keyword: Search term.

        Returns:
            list[str]: Zero to three titles. The lookup is best effort: any
            ordinary error yields whatever was collected so far, normally an
            empty list.
        """
        trends = []
        try:
            suggs = self.pytrends.suggestions(keyword)
            if not suggs:
                return trends
            mid = suggs[0]['mid']
            self.pytrends.build_payload([mid], cat=0, timeframe='today 5-y',
                                        geo='', gprop='')
            top = self.pytrends.related_topics()[mid]['top']
            if top is None:
                return trends
            for title in top['topic_title'][:3]:
                if title.lower() != keyword.lower():
                    trends.append(title.lower())
        except Exception:
            # Deliberately best effort, but no longer a bare ``except``:
            # KeyboardInterrupt and SystemExit must still propagate.
            pass
        return trends
class Crawler: def __init__(self, url, data): self.url = url self.source_code = requests.get(self.url) self.plain_text = self.source_code.text self.soup = BeautifulSoup(self.plain_text, "html.parser") self.data = data self.modal_data = {} self.modal_images = {} self.modal_keywords = {} self.pytrends = TrendReq(hl='en-US') def crawl(self, maxPages): page = 1 seo_result = {} seo_result_2 = {} while page <= maxPages: page += 1 page_speed_url = "https://www.googleapis.com/pagespeedonline/v2" \ "/runPagespeed?url=" + self.url + \ "&filter_third_party_resources=true&locale=en_US" \ "&screenshot=true&strategy=mobile&key" \ "=AIzaSyALJmgrWTmS3hvq3lxWzWnvo9FimxY-VAY" mobile_ready_url = 'https://www.googleapis.com/pagespeedonline' \ '/v3beta1' \ '/mobileReady?url=' + self.url + '&key=AIzaSyALJmgrWTmS3hvq3lxWzWnvo9FimxY-VAY' page_speed = requests.get(page_speed_url).json() page_speed['screenshot']['data'] = page_speed['screenshot'][ 'data'].replace('_', '/') page_speed['screenshot']['data'] = page_speed['screenshot'][ 'data'].replace('-', '+') seo_result['page_speed'] = page_speed mobile_ready = requests.get(mobile_ready_url).json() mobile_ready['screenshot']['data'] = mobile_ready['screenshot'][ 'data'].replace('_', '/') mobile_ready['screenshot']['data'] = mobile_ready['screenshot'][ 'data'].replace('-', '+') seo_result['mobile_ready'] = mobile_ready seo_result_2['Nested Tables'] = self.check_nested_tables() seo_result_2['Title'] = self.check_title() seo_result_2['Description'] = self.check_description() seo_result_2['<h1> Headings'] = self.check_headings_one() seo_result_2['<h2> Headings'] = self.check_headings_two() seo_result_2['Image Alt Tag'] = self.check_images_alt() Tag.do_not_call_in_templates = True seo_result_2['Broken Links'] = self.check_broken_links() seo_result_2['Bad Urls'] = self.check_urls() seo_result_2['Inline CSS'] = self.check_styles() seo_result_2['Google Analytics'] = self.check_google_analytics() seo_result_2['Favicon'] = self.check_favicon() 
seo_result_2['HTTPS'] = self.check_https() seo_result_2['Frameset'] = self.check_frames() seo_result_2['Canonical Tag'] = self.check_canonical() seo_result_2['No Index'] = self.check_noindex() seo_result_2['Robots'] = self.check_robots_txt() seo_result_2['robots_url'] = self.get_robots() seo_result_2['Sitemaps'] = self.check_site_map() seo_result_2['Doctype'] = self.check_doctype() seo_result_2['No Follow'] = self.check_nofollow() seo_result_2['Flash'] = self.check_flash() seo_result_2['Keywords'] = self.keyword_results_data( self.keyword_results()) seo_result_2['Mobile Ready'] = self.check_mobile_ready( mobile_ready) scores = {'passed': 0, 'failed': 0, 'warning': 0} result = {} for key, value in seo_result_2.items(): try: if value['passed'] is True: scores['passed'] += 1 result[key] = {'passed': True} elif value['passed'] is False: scores['failed'] += 1 result[key] = {'passed': False} elif value['passed'] is None: scores['warning'] += 1 result[key] = {'passed': None} except: pass api_list = {'Speed': {}, 'Usability': {}} for key, value in page_speed['formattedResults'][ 'ruleResults'].items(): if key == 'MainResourceServerResponseTime': continue try: if value['urlBlocks']: scores['failed'] += 1 result[value['localizedRuleName']] = {'passed': False} else: scores['passed'] += 1 result[value['localizedRuleName']] = {'passed': True} except: scores['passed'] += 1 result[value['localizedRuleName']] = {'passed': True} if 'SPEED' in value['groups']: api_list['Speed'][key] = value elif 'USABILITY' in value['groups']: api_list['Usability'][key] = value result[value['localizedRuleName']]['priority'] = 'medium' seo_result['list'] = seo_result_2 #self.format_keyword_data(self.keyword_results()) seo_result['messages'] = json.dumps(self.modal_data) seo_result['images'] = json.dumps(self.modal_images) #seo_result['keywords'] = json.dumps(self.modal_keywords) seo_result['url'] = self.url seo_result['scores'] = json.dumps(scores) seo_result['result_set'] = json.dumps(result) 
priority_list = {'high': [], 'medium': [], 'low': []} for key, value in self.data.items(): result[key]['priority'] = value['priority'] for key, value in result.items(): if value['passed'] is False: priority_list[result[key]['priority']].append( '<a href="#" onclick="navigate(\'' + key + '\', 100); return false;" class="orangelink">' + html.escape(key) + '</a>') priority_list = { key: value if len(value) > 0 else ['All checks have been passed for this priority'] for key, value in priority_list.items() } seo_result['result_set'] = json.dumps(result) seo_result['result_s'] = result priority_list = { 'high': ', '.join(priority_list['high']), 'medium': ', '.join(priority_list['medium']), 'low': ', '.join(priority_list['low']) } seo_result['priority_list'] = priority_list omen = {'Common': {}, 'Advanced Seo': {}, 'Server': {}} for k, v in seo_result['list'].items(): if type(v).__name__ == 'dict': if v.get('category') == 'common': omen['Common'][k] = v elif v.get('category') == 'server': omen['Server'][k] = v elif v.get('category') == 'advanced_seo': omen['Advanced Seo'][k] = v seo_result['omen'] = omen seo_result['api_list'] = api_list seo_result['nav_list'] = dict(omen, **api_list).keys() seo_result['total'] = json.dumps(self.calculate_result(result)) return seo_result def calculate_result(self, data): scores = {'passed': 0, 'failed': 0} rates = {'high': 3, 'medium': 2, 'low': 1} for key, value in data.items(): if type(value).__name__ == 'dict': if value.get('passed') is True: scores['passed'] += rates[value['priority']] elif value.get('passed') is False: scores['failed'] += rates[value['priority']] return scores def check_mobile_ready(self, mobile): return { 'passed': mobile['ruleGroups']['USABILITY']['pass'], 'content': mobile['ruleGroups']['USABILITY']['pass'], 'length': 0, 'name': 'Mobile Ready', 'msg': self.format_message('Mobile Ready', mobile['ruleGroups']['USABILITY']['pass'], []), 'category': self.get_from_queryset('Mobile Ready', 'category'), 'priority': 
self.get_from_queryset('Mobile Ready', 'priority') } def check_title(self): title = '' if self.soup.find('title') is None else self.soup.find( 'title').string return { 'passed': True if (len(title) <= 70 and len(title) > 0) else False, 'content': title, 'length': len(title), 'name': 'title', 'msg': self.format_message( 'Title', True if (len(title) <= 70 and len(title) > 0) else False, [len(title)]), 'msg_data': '<p>' + title + '</p>' if len(title) > 0 else '', 'category': self.get_from_queryset('Title', 'category'), 'priority': self.get_from_queryset('Title', 'priority') } def check_description(self): description = '' if self.soup.find( 'meta', {'name': "description"}) is None else self.soup.find( 'meta', { 'name': "description" }).get('content') return { 'passed': True if (len(description) <= 160 and len(description) > 0) else False, 'content': description, 'length': len(description), 'name': 'description', 'msg': self.format_message( 'Description', True if (len(description) <= 160 and len(description) > 0) else False, [len(description)]), 'msg_data': '<p>' + description + '</p>' if len(description) > 0 else '', 'category': self.get_from_queryset('Description', 'category'), 'priority': self.get_from_queryset('Description', 'priority') } def check_headings_one(self): headings_one = [] if self.soup.findAll( 'h1') is None else self.soup.findAll('h1') return { 'passed': True if len(headings_one) == 1 else False, 'content': headings_one, 'length': len(headings_one), 'name': 'headings_one', 'msg': self.format_message('<h1> Headings', True if len(headings_one) == 1 else False, [len(headings_one)]), 'category': self.get_from_queryset('<h1> Headings', 'category'), 'priority': self.get_from_queryset('<h1> Headings', 'priority') } def check_headings_two(self): headings_two = [] if self.soup.findAll( 'h2') is None else self.soup.findAll('h2') return { 'passed': True if len(headings_two) > 0 else False, 'content': headings_two, 'length': len(headings_two), 'name': 
'headings_two', 'msg': self.format_message('<h2> Headings', True if len(headings_two) > 0 else False, [len(headings_two)]), 'category': self.get_from_queryset('<h2> Headings', 'category'), 'priority': self.get_from_queryset('<h2> Headings', 'priority') } def check_images_alt(self): images = self.soup.findAll('img') no_alt = [] if (images is None): return { 'passed': True, 'content': None, 'length': 0, 'name': 'no_alt', 'msg': self.format_message('Image Alt Tag', True, [len(images), len(no_alt)]), 'category': self.get_from_queryset('Image Alt Tag', 'category'), 'priority': self.get_from_queryset('Image Alt Tag', 'priority') } else: for image in images: alternative = image.get('alt') #image['height'] = '50%' #image['width'] = '50%' if (alternative is None): #no_alt.append(str(self.check_url(image))) no_alt.append(str(image)) #no_alt.append(image) elif (alternative == ""): #no_alt.append(str(self.check_url(image))) no_alt.append(str(image)) #no_alt.append(image) else: pass self.modal_data['Image Alt Tag'] = json.dumps( [html.escape(el) for el in no_alt]) self.modal_images = no_alt return { 'passed': False if len(no_alt) > 0 else True, 'content': no_alt, 'length': len(no_alt), 'name': 'no_alt', 'msg': self.format_message('Image Alt Tag', False if len(no_alt) > 0 else True, [len(images), len(no_alt)]), 'category': self.get_from_queryset('Image Alt Tag', 'category'), 'priority': self.get_from_queryset('Image Alt Tag', 'priority'), 'json': json.dumps(no_alt), 'html': json.dumps([html.escape(el) for el in no_alt]) } def check_url(self, tag): url = tag.get('src') if not url.startswith(self.url): tag['src'] = self.url + url if ( url.startswith('/') and not self.url.endswith('/')) or ( not url.startswith('/') and self.url.endswith('/')) else self.url + '/' + url tag['src'].replace('///', '/') return tag def check_links(self): links = self.soup.findAll('a', href=True) linkz = [] for link in links: if link.get('href') != '' and link.get('href') != '#': linkz.append(str(link)) 
return linkz def check_broken_links(self, update=False): links = self.soup.findAll('a') broken_links = [] all_links = [] headers = { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36' } if (links is None): return { 'passed': True, 'content': None, 'length': 0, 'name': 'broken_links', 'msg': self.format_message('Broken Links', True, [len(broken_links)]), 'category': self.get_from_queryset('Broken Links', 'category'), 'priority': self.get_from_queryset('Broken Links', 'priority') } else: for link in links: href = link.get('href') if (href is None or link.get('onclick') is not None): continue elif ('#' == href or 'javascript' in href or '@' in href): continue elif ('https://' not in href and 'http://' not in href): href = self.url + href try: all_links.append(href) request = requests.head(href, headers=headers) if request.status_code >= 400: request = requests.get(href, headers=headers) except: broken_links.append(str(link)) if request.status_code < 400: pass else: broken_links.append(str(link)) #if update: #self.modal_data['Broken Links'] = json.dumps([html.escape(el) for el in broken_links]) self.modal_data['Broken Links'] = json.dumps( [html.escape(el) for el in broken_links]) return { 'passed': False if len(broken_links) > 0 else True, 'content': broken_links, 'length': len(broken_links), 'name': 'broken_links', 'msg': self.format_message( 'Broken Links', False if len(broken_links) > 0 else True, [len(all_links), len(broken_links)]), 'html': json.dumps(broken_links), 'category': self.get_from_queryset('Broken Links', 'category'), 'priority': self.get_from_queryset('Broken Links', 'priority') } def url_exists(self, value): try: request = requests.head(value, allow_redirects=True) except ConnectionError: return False except MissingSchema: return False except: return False else: return True def check_script_async(self): scripts = self.soup.findAll('script') result = [] if scripts is 
None: return { 'passed': True, 'content': None, 'length': 0, 'name': 'script_async', 'msg': self.format_message('Script Async', True, [len(result)]) } else: for script in scripts: if script.get('async') and script.get( 'src') is None or script.get('src') is '': result.append(script) return { 'passed': False if len(result) > 0 else True, 'content': result, 'length': len(result), 'name': 'script_async', 'msg': self.format_message('Script Async', False if len(result) > 0 else True, [len(result)]) } def check_section(self): result = [] for counter in range(1, 7): headings = self.soup.findAll('h' + str(counter)) if headings is not None: for heading in headings: if heading.contents: if '<section' in str(heading.contents): result.append(heading) return result def check_section_headings(self): sections = self.soup.findAll('section') result = [] if sections is None: return result else: for section in sections: if section.contents: if ('<h2' not in str(section.contents) and '<h3' not in str(section.contents) and '<h3' not in str(section.contents) and '<h4' not in str(section.contents) and '<h5' not in str(section.contents) and '<h6' not in str(section.contents)): result.append(section) return result def check_urls(self): links = self.soup.findAll('a') bad_urls = [] if (links is None): return { 'passed': True, 'content': bad_urls, 'length': len(bad_urls), 'name': 'bad_urls', 'msg': self.format_message('Bad Urls', True, [len(bad_urls)]), 'category': self.get_from_queryset('Bad Urls', 'category'), 'priority': self.get_from_queryset('Bad Urls', 'priority') } else: for link in links: href = link.get('href') if (href is None or link.get('onclick') is not None): continue elif ('#' == href or 'javascript' in href or '@' in href): continue elif ('https://' not in href and 'http://' not in href): href = self.url + href if ('?' 
in href): if '_' in href.split('?')[0] or '=' in href.split('?')[0]: bad_urls.append(str(link)) else: if '_' in href or '=' in href: bad_urls.append(str(link)) self.modal_data['Bad Urls'] = json.dumps( [html.escape(el) for el in bad_urls]) return { 'passed': False if len(bad_urls) > 0 else True, 'content': bad_urls, 'length': len(bad_urls), 'name': 'bad_urls', 'msg': self.format_message('Bad Urls', False if len(bad_urls) > 0 else True, [len(bad_urls)]), 'html': json.dumps(bad_urls), 'category': self.get_from_queryset('Bad Urls', 'category'), 'priority': self.get_from_queryset('Bad Urls', 'priority') } def check_styles(self): elements = [] if self.soup.findAll() is None else self.soup.findAll() result = [] for element in elements: style = element.get('style') if (style is not None): result.append(str(element)) self.modal_data['Inline CSS'] = json.dumps( [html.escape(el) for el in result]) return { 'passed': False if len(result) > 0 else True, 'content': result, 'length': len(result), 'name': 'inline_styles', 'msg': self.format_message('Inline CSS', False if len(result) > 0 else True, [len(result)]), 'html': json.dumps([html.escape(el) for el in result]), 'json': json.dumps(result), 'category': self.get_from_queryset('Inline CSS', 'category'), 'priority': self.get_from_queryset('Inline CSS', 'priority') } def check_google_analytics(self): scripts = [] if self.soup.findAll( 'script') is None else self.soup.findAll('script') result = False for script in scripts: src = script.get('src') if (src is not None): if ('//www.google-analytics.com/analytics.js' in src): result = True break elif ('google-analytics.com' in script.string): result = True break return { 'passed': result, 'content': None, 'length': 0, 'name': 'google_analytics', 'msg': self.format_message('Google Analytics', result, [0]), 'category': self.get_from_queryset('Google Analytics', 'category'), 'priority': self.get_from_queryset('Google Analytics', 'priority') } def check_favicon(self): links = 
self.soup.findAll('link') result = False if (links is not None): for link in links: rel = link.get('rel') if (rel is not None): if ('icon' in rel): result = link break return { 'passed': True if result is not False else False, 'content': result, 'length': 0, 'name': 'favicon', 'msg': self.format_message('Favicon', True if result is not False else False, [result]), 'category': self.get_from_queryset('Favicon', 'category'), 'priority': self.get_from_queryset('Favicon', 'priority') } def check_robots(self, robots_url): try: from urllib.request import urlopen with urlopen(robots_url) as stream: result = stream.read().decode("utf-8") if (result): return result else: return False except: return False def check_robots_txt(self): robots_url = self.get_robots() if robots_url: try: from urllib.request import urlopen with urlopen(robots_url) as stream: # print(stream.read().decode("utf-8")) result = stream.read().decode("utf-8") except: result = False else: result = False return { 'passed': True if result else False, 'content': result, 'length': 1 if result else 0, 'name': 'robots', 'msg': self.format_message('Robots', True if result else False, []), 'msg_data': '<p><a href="' + robots_url + '">' + robots_url + '</a></p>' if result else '', 'category': self.get_from_queryset('Robots', 'category'), 'priority': self.get_from_queryset('Robots', 'priority') } def get_robots(self): return reppy.Robots.robots_url(self.url) def check_sitemap(self): temp = reppy.Robots.fetch(self.url) return list(temp.sitemaps) def check_site_map(self): sitemaps = [] str = self.check_robots(self.get_robots()) if str is not False: lines = str.split('\n') for line in lines: if 'Sitemap:' in line: temp = line.split(':', 1) if temp[1] not in sitemaps: sitemaps.append(temp[1]) result = '<p>' for sitemap in sitemaps: result += '<a href="' + sitemap.lower().strip( ) + '">' + sitemap.lower().strip() + '</a><br>' result += '</p>' else: result = '' return { 'passed': True if len(sitemaps) > 0 else False, 
'content': sitemaps, 'length': len(sitemaps), 'name': 'sitemaps', 'msg': self.format_message('Sitemaps', True if len(sitemaps) > 0 else False, [len(sitemaps)]), 'msg_data': result if len(sitemaps) > 0 else '', 'category': self.get_from_queryset('Sitemaps', 'category'), 'priority': self.get_from_queryset('Sitemaps', 'priority') } def check_https(self): try: headers = { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36' } requests.get(self.url, headers=headers) result = True except requests.exceptions.SSLError: result = False except: result = False return { 'passed': result, 'content': None, 'length': 0, 'name': 'Https', 'msg': self.format_message('HTTPS', result, [0]), 'category': self.get_from_queryset('HTTPS', 'category'), 'priority': self.get_from_queryset('HTTPS', 'priority') } def check_deprecated_elements(self, type): elements = self.soup.findAll(type) result = [] if (elements is None): return False else: for element in elements: result.append(element) return result def get_most_common_keyword_count(self): ''' r = requests.get(self.url) soup = BeautifulSoup(r.content) text = (''.join(s.findAll(text=True)) for s in soup)#.findAll(text=True)) c = Counter((x.rstrip(punctuation).lower() for y in text for x in y.split())) print(c.most_common()) print([x for x in c if c.get(x) > 5]) ''' html = urllib.request.urlopen(self.url).read() # print(self.text_from_html(html)) liist = re.findall(r'\b\w+', self.text_from_html(html)) lst = [x.lower() for x in liist] counter = Counter(lst) occs = [(word, count) for word, count in counter.items() if count >= 5 and self.is_valid_keyword(word)] occs.sort(key=lambda x: x[1]) #print(occs) return occs def tag_visible(self, element): if element.parent.name in [ 'style', 'script', 'head', 'title', 'meta', '[document]' ]: return False if isinstance(element, Comment): return False return True def text_from_html(self, body): soup = BeautifulSoup(body, 
'html.parser') texts = soup.findAll(text=True) visible_texts = filter(self.tag_visible, texts) return u" ".join(t.strip() for t in visible_texts) def check_flash(self): result = True if '/.swf/S' in self.source_code or 'flashplayer' in self.source_code or 'https://get.adobe.com/flashplayer/' in self.source_code: result = False return { 'passed': result, 'content': None, 'length': 0, 'name': 'flash', 'msg': self.format_message('Flash', result, [0]), 'category': self.get_from_queryset('Flash', 'category'), 'priority': self.get_from_queryset('Flash', 'priority') } def check_nested_tables(self): tables = self.soup.findAll('table') nested_tables = [] for table in tables: if '<table' in table.contents: nested_tables.append(table) return { 'passed': False if len(nested_tables) > 0 else True, 'content': nested_tables, 'length': len(nested_tables), 'name': 'nested_tables', 'msg': self.format_message('Nested Tables', False if len(nested_tables) > 0 else True, [len(nested_tables)]), 'category': self.get_from_queryset('Nested Tables', 'category'), 'priority': self.get_from_queryset('Nested Tables', 'priority') } def check_frames(self): framesets = self.soup.findAll('frameset') frames = self.soup.findAll('frame') iframes = self.soup.findAll('iframe') return { 'passed': False if len(framesets) > 0 or len(frames) > 0 or len(iframes) > 0 else True, 'content': [framesets, frames], 'length': [len(framesets), len(frames), len(iframes)], 'name': 'Frameset', 'msg': self.format_message( 'Frameset', False if len(framesets) > 0 or len(frames) > 0 or len(iframes) > 0 else True, [len(framesets), len(frames), len(iframes)]), 'category': self.get_from_queryset('Frameset', 'category'), 'priority': self.get_from_queryset('Frameset', 'priority') } def check_canonical(self): canonical = self.soup.find('link', rel='canonical') return { 'passed': True if canonical else False, 'content': canonical, 'length': 1 if canonical else 0, 'name': 'canonical', 'msg': self.format_message( 'Canonical Tag', 
True if canonical else False, ['uses' if canonical else 'does not use', self.url]), 'category': self.get_from_queryset('Canonical Tag', 'category'), 'priority': self.get_from_queryset('Canonical Tag', 'priority') } def check_safe_browsing(self): ''' sbl = SafeBrowsingList('AIzaSyALJmgrWTmS3hvq3lxWzWnvo9FimxY-VAY') threat_list = sbl.lookup_url('http://github.com/') if threat_list == None: print("no threat") else: print('threats: ' + str(threat_list)) print("Type: ", type(threat_list)) return threat_list ''' safe_browsing_url = 'https://sb-ssl.google.com/safebrowsing/api/lookup?client=demo-app&key=AIzaSyALJmgrWTmS3hvq3lxWzWnvo9FimxY-VAY&appver=1.5.2&pver=3.1&url=http%3A%2F%2Fianfette.org%2F' safe_url = requests.get(safe_browsing_url) # .json() return safe_url def check_noindex(self): meta_tags = self.soup.findAll('meta') noindex_tags = [] if meta_tags and meta_tags is not None: for meta_tag in meta_tags: name = meta_tag.get('name') if name is not None: if 'robots' in name or 'googlebot' in name: if meta_tag.get('content') == 'noindex': noindex_tags.append(meta_tag) html_list = [] for link in noindex_tags: html_list.append(str(link)) self.modal_data['No Index'] = json.dumps( [html.escape(el) for el in html_list]) return { 'passed': None, 'content': noindex_tags, 'length': len(noindex_tags), 'name': 'no_index', 'msg': self.format_message('No Index', None, [ 'uses' if len(noindex_tags) > 0 else 'does not use', len(noindex_tags) ]), 'html': json.dumps(noindex_tags), 'category': self.get_from_queryset('No Index', 'category'), 'priority': self.get_from_queryset('No Index', 'priority') } def check_nofollow(self): links = self.soup.findAll('a', rel="nofollow") html_list = [] for link in links: html_list.append(str(link)) self.modal_data['No Follow'] = json.dumps( [html.escape(el) for el in html_list]) return { 'passed': None, 'content': links, 'length': len(links), 'name': 'no_follow', 'msg': self.format_message( 'No Follow', None, ['uses' if len(links) > 0 else 'does not 
use', len(links)]), 'html': json.dumps(html_list), 'category': self.get_from_queryset('No Follow', 'category'), 'priority': self.get_from_queryset('No Follow', 'priority') } def check_doctype(self): items = [ item for item in self.soup.contents if isinstance(item, bs4.Doctype) ] return { 'passed': True if items else False, 'content': str(items[0]) if items else None, 'length': 1 if items else 0, 'name': 'doctype', 'msg': self.format_message('Doctype', True if items else False, []), 'category': self.get_from_queryset('Doctype', 'category'), 'priority': self.get_from_queryset('Doctype', 'priority') } def check_media_queries(self): try: response = requests.get(self.url) soup = BeautifulSoup(response.content, 'lxml') css_links = [ link["href"] for link in soup.findAll("link") if "stylesheet" in link.get("rel", []) ] except Exception as e: pass pattern = re.compile(r'@media.+?\}') css_links = [] media_only = [] for url in css_links: try: response = requests.get(url).text media_only = pattern.findall(response) except Exception as e: media_only = [] except: media_only = [] return [css_links, media_only] def check_media(self): links = self.soup.findAll('link', rel="stylesheet") media_queries = [] for link in links: try: css = requests.get(link.get('href')).text except: continue if '@media ' in css or link.get('media') is not None: media_queries.append(link) return media_queries def safe_browsing(self): params = urllib.parse.urlencode({ 'client': 'api', 'apikey': 'AIzaSyALJmgrWTmS3hvq3lxWzWnvo9FimxY-VAY', 'appver': '1.5.2', 'pver': '3.1', 'url': self.url }) url = "https://sb-ssl.google.com/safebrowsing/api/lookup?%s" % params res = urlfetch.fetch(url, method=1) if res.status_code >= 400: raise Exception("Status: %s" % res.status_code) return res.status_code == 204 def get_category(self, name): try: cat = self.data[name]['category'] except: cat = None return cat def get_from_queryset(self, rule, name): try: data = self.data[rule][name] except: data = None return data def 
get_priority(self, name): try: priority = self.data[name]['priority'] except: priority = None return priority def get_message(self, name, status): try: msg = self.data[name][status]['message'] except: try: msg = self.data[name][None]['message'] except: msg = '' return msg def format_message(self, name, status, args): message = self.get_message(name, status) if len(message) > 0 and len(args) > 0: for arg in args: message = message.replace( '{{arg}}', arg if type(arg) is str else str(arg), 1) return message def keyword_results(self): results = self.get_most_common_keyword_count() keywords = {} for result in results: keywords[result[0]] = result[1] return keywords def keyword_results_data(self, keywords): if len(keywords) > 0: msg = '<ul class="list-group list-group-flush">' for key, value in keywords.items(): msg += '<li onclick="prepare_modal(\'Keywords\', \'' + key + '\')" class="list-group-item list-group-item-light" name="hover-item"><small>' + key + ' - ' + str( value) + ' times </small></li>' msg += '</ul>' else: msg = '' #self.modal_data['Keywords'] = json.dumps(self.format_keyword_data(keywords)) #self.modal_data['Keywords'] = json.dumps([html.escape(el) for el in no_alt]) return { 'passed': None, 'content': None, 'length': 0, 'name': 'keywords', 'msg': self.format_message('Keywords', None, []), #'html': json.dumps(self.format_keyword_data(keywords)), 'msg_data': msg, 'category': self.get_from_queryset('Keywords', 'category'), 'priority': self.get_from_queryset('Keywords', 'priority') } def is_valid_keyword(self, text): try: float(text) return False except: return True if len(text) > 1 else False def format_keyword_data(self, keywords): keyword_data = {} keywords_data = {} for key, value in keywords.items(): keyword_data[key] = {'data': self.keyword_data(key), 'freq': value} keywords_data[key] = self.keyword_data(key) self.modal_keywords = keywords_data return keyword_data def keyword_data(self, keyword, res='COUNTRY'): #pytrends = TrendReq(hl='en-US') 
self.pytrends.build_payload([keyword]) data = {} dat = {} data['interest_over_time'] = self.pytrends.interest_over_time( ).to_dict() for key, value in data['interest_over_time'].items(): dat[key] = {} for k, v in value.items(): dat[key][str(k.to_pydatetime())] = v data['interest_over_time'] = dat data['interest_by_region'] = self.pytrends.interest_by_region( resolution=res).to_dict() data['related_topics'] = self.pytrends.related_topics() data['related_queries'] = self.pytrends.related_queries() data['suggestions'] = self.pytrends.suggestions(keyword) data['related_topics'][keyword] = data['related_topics'][ keyword] if data['related_topics'][keyword] is None else data[ 'related_topics'][keyword].to_dict() data['related_queries'][keyword] = data['related_queries'][keyword] data['related_queries'][ keyword]['top'] = data['related_queries'][keyword]['top'].to_dict( ) if data['related_queries'][keyword]['top'] is not None else None data['related_queries'][keyword]['rising'] = data['related_queries'][ keyword]['rising'].to_dict() if data['related_queries'][keyword][ 'rising'] is not None else None return data
# Bar chart of the raw per-region interest for the keyword.
df.reset_index().plot(kind='bar', x='geoName', y='Chinese virus', figsize=(120, 10))

# The table is very sparse, so keep only the regions whose score reaches the
# threshold of 20 and order them from highest to lowest before plotting again.
df = df[df['Chinese virus'] >= 20].sort_values(by=['Chinese virus'], ascending=False)
df.reset_index().plot(kind='bar', x='geoName', y='Chinese virus', figsize=(30, 10))

# ======================== Daily Search Trends ========================
# Google "hot trends" for the United States.
df = pytrend.trending_searches(pn='united_states')
df.head()

# Searches trending today.
df = pytrend.today_searches(pn='US')

# =========================== Related Queries ==========================
# Build the payload for the keyword of interest; to inspect another keyword,
# rerun everything from here with the new name in kw_list.
pytrend.build_payload(kw_list=['Chinese virus'])

# Related queries: a dictionary of DataFrames.
related_queries = pytrend.related_queries()
related_queries.values()

# Related topics: a dictionary of DataFrames.
related_topic = pytrend.related_topics()
related_topic.values()
for contory in contories:
    # Tell Google Trends which keyword, period and region to query
    # (last five years, Japan, web search).
    pytrends.build_payload([contory],
                           cat=0,
                           timeframe='today 5-y',
                           geo='JP',
                           gprop='')

    # Other exports that were tried here and are currently disabled:
    #   pytrends.related_queries()[contory]['top' / 'rising']
    #       -> data/related_queries/top|raising/<keyword>.csv
    #   pytrends.interest_over_time()
    #       -> data/interest_over_time/<keyword>.csv

    # related_topics() maps each keyword to {'top': ..., 'rising': ...}; an
    # entry is None when Google has no data for it, so skip such keywords
    # instead of crashing on None.to_csv and losing the remaining exports.
    data3 = pytrends.related_topics()
    rising = data3[contory]['rising']
    if rising is None:
        print('no rising related topics for', contory)
        continue

    rising.to_csv('data/related_topics/' + contory + '.csv',
                  encoding='utf_8_sig')
import pandas as pd
from pytrends.request import TrendReq

pytrends = TrendReq(hl='ja-JP', tz=360)

# Keywords to query (Google Trends accepts up to five at once).
kw_list = ["アメリカ"]

pytrends.build_payload(kw_list,
                       cat=0,
                       timeframe='today 3-m',
                       geo='JP',
                       gprop='')

date = '2020-04'

# related_topics() returns {keyword: {'top': DataFrame|None,
# 'rising': DataFrame|None}} -- a plain dict, which has no to_csv().
related = pytrends.related_topics()
print(related)

# Flatten every available table into one DataFrame, tagging each row with the
# keyword and the table ('top' / 'rising') it came from.
frames = []
for keyword, tables in related.items():
    for kind, table in tables.items():
        if table is None:  # Google returned no data for this table
            continue
        frames.append(table.assign(keyword=keyword, kind=kind))
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Write the result as out.csv inside out.zip.
compression_opts = dict(method='zip', archive_name='out.csv')
df.to_csv('out.zip', index=False, compression=compression_opts)
def main(output_dir="C:\\Users\\gwang\\Documents\\01 ADS Projects"):
    """Download several Google Trends reports for one keyword and save them as CSV.

    Args:
        output_dir: Directory that receives the CSV files. Defaults to the
            location the script originally wrote to.
    """
    import os

    def save(frame, filename):
        # Write one report next to the others, keeping the index column.
        frame.to_csv(os.path.join(output_dir, filename), index=True)

    # Set up api wrapper
    pytrends = TrendReq(hl='en-US', tz=360)

    # Limit of 5 keywords
    kw_list = ["Steelcase"]

    # Build pipeline
    pytrends.build_payload(kw_list, cat=0, timeframe='all', geo='', gprop='')

    # Overall interest over the entire timeline
    interestDF = pytrends.interest_over_time()
    save(interestDF, "GoogleTrends5YearInterest_test.csv")
    print(interestDF.head())
    print()

    # Hourly interest over the given window; sleep=60 spaces out the requests
    # so the client does not get rate limited.
    hourlyDF = pytrends.get_historical_interest(kw_list,
                                                year_start=2019,
                                                month_start=7,
                                                day_start=1,
                                                hour_start=0,
                                                year_end=2019,
                                                month_end=7,
                                                day_end=1,
                                                hour_end=1,
                                                cat=0,
                                                geo='',
                                                gprop='',
                                                sleep=60)
    save(hourlyDF, "GoogleTrends5YearHourlyInterest_test.csv")
    print(hourlyDF.head())
    print()

    # Regional interest across the world (resolution can be switched to a
    # state- or city-level value).
    regionDF = pytrends.interest_by_region(resolution='COUNTRY',
                                           inc_low_vol=True,
                                           inc_geo_code=False)
    save(regionDF, "GoogleTrendsRegionInterest_test.csv")
    print(regionDF.head())
    print()

    # Fetch the related topics once: both tables come from the same response,
    # so a second request only adds latency and rate-limit pressure.
    related = pytrends.related_topics().get(kw_list[0]) or {}
    risingDF = related.get('rising')
    topDF = related.get('top')

    # Either table is None when Google has no data for it.
    if risingDF is not None:
        save(risingDF, "GoogleTrendsRisingRelated_test.csv")
    if topDF is not None:
        save(topDF, "GoogleTrendsTopRelated_test.csv")

    print(risingDF.head() if risingDF is not None else "no rising related topics")
    print()
    print(topDF.head() if topDF is not None else "no top related topics")
class Trendsetter():
    """Convenience wrapper around pytrends plus a Google Translate client."""

    def __init__(self, timezone=1, language='en-US'):
        """
        Args:
            timezone: timezone in hours
            language: language of interface, not important
        """
        # pytrends takes the offset in minutes; it is stored here as -60 * hours.
        self.tz = -60 * timezone
        # country name -> [country code, language code used for translation]
        self.countries = {
            'united_states': ['US', 'en'],
            'united_kingdom': ['GB', 'en'],
            'australia': ['AU', 'en'],
            'germany': ['DE', 'de'],
            'france': ['FR', 'fr'],
            'italy': ['IT', 'it'],
            'japan': ['JP', 'ja'],
            'saudi_arabia': ['SA', 'ar'],
            'egypt': ['EG', 'ar'],
            # 'china': ['CN', 'zh-cn'],
            # 'iran': ['IR', 'ar'],
            'brazil': ['BR', 'pt'],
            'india': ['IN', 'hi'],
            'israel': ['IL', 'iw'],
            # 'spain': ['ES', 'es'],
            'mexico': ['MX', 'es'],
            'russia': ['RU', 'ru'],
            'south_korea': ['KR', 'ko'],
            'taiwan': ['TW', 'zh-tw'],
            'hong_kong': ['HK', 'zh-tw'],
            'thailand': ['TH', 'th'],
            'turkey': ['TR', 'tr'],
            'vietnam': ['VN', 'vi'],
        }
        # Reverse lookup: country code -> country name.
        self.countrycodes = {v[0]: k for k, v in self.countries.items()}
        self.trends = TrendReq(hl=language, tz=self.tz)
        self.translator = gt.Translator(service_urls=[
            "translate.google.com", "translate.google.co.kr",
            "translate.google.at", "translate.google.de",
            "translate.google.ru", "translate.google.ch",
            "translate.google.fr", "translate.google.es"
        ])

    def browse_categories(self, levels=None):
        """Browse the category tree by a list of child indices.

        Args:
            levels: list of indices to descend through, eg. [4,2]; None or an
                empty list stays at the root category.

        Returns:
            DataFrame with the child categories, or None when the selected
            category has no children.
        """
        cat = self.trends.categories()
        for i in levels or ():
            cat = cat['children'][i]
        print(cat['name'], ", id =", cat['id'])
        if 'children' not in cat:
            # Leaf category: nothing further to list.
            return None
        return pd.DataFrame.from_dict(cat['children'])

    def get_trending(self, country='united_states'):
        """Get current and daily trends for an implemented country.

        Non-English results are additionally translated to English and added
        under '<key>_en'; translation failures only emit a warning.

        Args:
            country: country name or country code

        Returns:
            {'trending': list, 'today': list}

        Raises:
            ValueError if country not supported
        """
        if country not in self.countries:
            if country not in self.countrycodes:
                raise ValueError("Country not supported.")
            country = self.countrycodes[country]

        code, lang = self.countries[country]
        self.trending = {
            'trending': list(self.trends.trending_searches(pn=country)[0]),
            'today': list(self.trends.today_searches(pn=code)),
        }

        if lang != 'en':
            try:
                self.trending_en = {
                    k + '_en': [
                        t.text for t in self.translator.translate(
                            v, dest='en', src=lang)
                    ]
                    for k, v in self.trending.items()
                }
                self.trending.update(self.trending_en)
            except JSONDecodeError:
                warnings.warn("google translate API limit reached")
            except Exception:
                # Translation is best effort; the untranslated lists are
                # still returned.
                warnings.warn("google translate API not working")
        return self.trending

    def get_related(self,
                    kw,
                    timeframe='now 7-d',
                    category=0,
                    location='',
                    gtype=''):
        """Return the top and rising related topics of one keyword.

        Args:
            kw: keyword to query
            timeframe: supported google format, or [t_start, t_end]
            category: category id
            location: supported google location or country code
            gtype: google property filter

        Returns:
            DataFrame with the 'top' rows followed by the 'rising' rows
            (empty when neither table is available).
        """
        if isinstance(timeframe, list):
            tf_str = ' '.join(timeframe)
        else:
            tf_str = timeframe
        self.trends.build_payload([kw],
                                  cat=category,
                                  timeframe=tf_str,
                                  geo=location,
                                  gprop=gtype)
        related_topics = self.trends.related_topics()[kw]
        # A table is None when there is no data for it; DataFrame.append was
        # removed in pandas 2.0, so combine with pd.concat instead.
        frames = [
            related_topics[kind] for kind in ('top', 'rising')
            if related_topics[kind] is not None
        ]
        if not frames:
            return pd.DataFrame()
        return pd.concat(frames, ignore_index=True, sort=False)

    def get_interest(self,
                     kwds,
                     timeframe='now 7-d',
                     category=0,
                     location='',
                     gtype=''):
        """Return interest over time for up to five keywords.

        Args:
            kwds: keyword or list of up to 5 keywords
            timeframe: supported google format, or [t_start, t_end];
                for daily output: 'YYYY-mm-dd',
                for hourly output: 'YYYY-mm-ddThh'
            category: category id
            location: supported google location or country code
            gtype: google property filter

        Returns:
            DataFrame
        """
        if isinstance(kwds, str):
            kwds = [kwds]
        if isinstance(timeframe, list):
            tf_str = ' '.join(timeframe)
        else:
            tf_str = timeframe
            timeframe = timeframe.split(' ')

        hourly = 'T' in tf_str
        format_str = '%Y-%m-%dT%H' if hourly else '%Y-%m-%d'

        # needs improvement: relative timeframes are detected by substring.
        relative = any(s in tf_str for s in ['now', 'today', 'all'])
        if not relative:
            t_start = datetime.datetime.strptime(timeframe[0], format_str)
            t_end = datetime.datetime.strptime(timeframe[1], format_str)

        # Hourly ranges of eight days or more go through
        # get_historical_interest; everything else is a single payload.
        if (not relative and hourly
                and t_end - t_start >= datetime.timedelta(days=8)):
            self.interest = self.trends.get_historical_interest(
                kwds,
                year_start=t_start.year,
                year_end=t_end.year,
                month_start=t_start.month,
                month_end=t_end.month,
                day_start=t_start.day,
                day_end=t_end.day,
                hour_start=t_start.hour,
                hour_end=t_end.hour,
                cat=category,
                geo=location,
                gprop=gtype,
                sleep=60)
        else:
            self.trends.build_payload(kwds,
                                      cat=category,
                                      timeframe=tf_str,
                                      geo=location,
                                      gprop=gtype)
            self.interest = self.trends.interest_over_time()
        return self.interest
def index():
    """Render the 30-day Google Trends summary for the posted keyword.

    On POST, reads the keyword from the form field 'name', queries Google
    Trends (Japan, last 30 days) and renders 'choice.html' with the top and
    rising related topics/queries plus a base64-encoded PNG line chart.

    NOTE(review): non-POST requests fall through and return None, exactly as
    before -- confirm whether a GET response is handled elsewhere.
    """
    if request.method == 'POST':
        keyword = request.form['name']

        # Query window: from 30 days ago until today.
        today = date.today()
        day = today - timedelta(30)

        pytrends = TrendReq(hl='ja-JP', tz=360)
        pytrends.build_payload([keyword],
                               cat=0,
                               timeframe=str(day) + ' ' + str(today),
                               geo='JP',
                               gprop='')

        # Time-series data.
        interest = pytrends.interest_over_time()

        # Related topics: top and rising.
        topics = pytrends.related_topics()
        text = _join_top_ten(topics, keyword, 'top', 'topic_title')
        text2 = _join_top_ten(topics, keyword, 'rising', 'topic_title')

        # Related queries: top and rising.
        queries = pytrends.related_queries()
        text3 = _join_top_ten(queries, keyword, 'top', 'query')
        text4 = _join_top_ten(queries, keyword, 'rising', 'query')

        # Round-trip the series through CSV text so the dates become plain
        # strings for plotting, but do it in memory: the old on-disk
        # '<microsecond>.csv' was never deleted, could collide between
        # requests, and its cp932 encoding failed on unencodable keywords.
        buffer = io.StringIO()
        interest.to_csv(buffer)
        buffer.seek(0)
        df = pd.read_csv(buffer)

        # Draw on a dedicated figure and always close it, so figures do not
        # accumulate across requests.
        img = io.BytesIO()
        fig, ax = plt.subplots()
        try:
            ax.plot(df['date'], df[keyword], marker="o")
            ax.set_xlabel(df['date'].name)
            ax.set_ylabel(keyword)
            fig.savefig(img, format='png')
        finally:
            plt.close(fig)
        img.seek(0)
        plot_url = base64.b64encode(img.getvalue()).decode()

        return render_template('choice.html',
                               text=text,
                               text2=text2,
                               text3=text3,
                               text4=text4,
                               img="data:image/png;base64,{}".format(plot_url))


def _join_top_ten(tables, keyword, kind, column):
    """Join the first ten values of tables[keyword][kind][column] as bullets.

    Returns 'なし' when the table is missing, None, or lacks the column.
    """
    try:
        return '\n・'.join(tables[keyword][kind].head(10)[column])
    except (KeyError, TypeError, AttributeError, IndexError):
        return 'なし'
# Prepare the request for a single keyword.
pytrend.build_payload(kw_list=['photography'])

# Other reports available on the same payload (currently disabled):
# # Interest Over Time
# interest_over_time_df = pytrend.interest_over_time()
# print(interest_over_time_df.head())
# # Interest by Region
# interest_by_region_df = pytrend.interest_by_region()
# print(interest_by_region_df.head())
# # Related Queries, returns a dictionary of dataframes
# related_queries_dict = pytrend.related_queries()
# print(related_queries_dict)

# Related topics come back as a dictionary of DataFrames; show it as is.
related_topic_dict = pytrend.related_topics()
print(related_topic_dict)

# # Get Google Hot Trends data
# trending_searches_df = pytrend.trending_searches()
# print(trending_searches_df.head())
# # Get Google Hot Trends data
# today_searches_df = pytrend.today_searches()
# print(today_searches_df.head())
# # Get Google Top Charts
# top_charts_df = pytrend.top_charts(2018, hl='en-US', tz=300, geo='GLOBAL')
# print(top_charts_df.head())
# # Get Google Keyword Suggestions
class GoogleTrend(DataCollector):
    """Fetches Google Trends data through pytrends."""

    # Row positions of the interest-by-region table that are summed to
    # normalise the scores, and how rows are merged into the reported regions.
    # NOTE(review): row 2 is counted in two groups and row 8 only in the
    # normaliser -- kept exactly as before; confirm against the row order
    # returned for geo='KR'.
    _RATIO_ROWS = (0, 1, 2, 3, 8, 11, 12, 13, 14, 15)
    _REGION_GROUPS = (
        ('강원도', (0, )),
        ('서울/경기', (1, 2)),
        ('경상도', (2, 3)),
        ('전라도', (11, 12)),
        ('제주도', (13, )),
        ('충청도', (14, 15)),
    )

    def __init__(self,
                 keyword=None,
                 hl='ko',
                 tz='82',
                 timeframe='today 5-y',
                 cat=0,
                 geo='KR',
                 gprop=''):
        """Store the query settings and build the first payload.

        Args:
            keyword: list of keywords; defaults to ['youtube'].
            hl: host language.
            tz: time zone.
            timeframe: pytrends timeframe string.
            cat: category id.
            geo: region code.
            gprop: Google property filter.
        """
        self.hl = hl
        self.tz = tz
        # Fresh list per instance instead of a shared mutable default.
        self.keyword = ['youtube'] if keyword is None else keyword
        self.timeframe = timeframe
        self.cat = cat
        self.geo = geo
        self.gprop = gprop
        self.update_pytrend()
        self.update_payload()

    def update_pytrend(self):
        """Create the pytrends session; later requests reuse it."""
        self.pytrend = TrendReq(hl=self.hl, tz=self.tz)

    def update_payload(self):
        """Build the payload from the stored settings.

        Needed before interest_over_time(), interest_by_region() and
        related_queries().
        """
        self.pytrend.build_payload(kw_list=self.keyword,
                                   cat=self.cat,
                                   timeframe=self.timeframe,
                                   geo=self.geo,
                                   gprop=self.gprop)

    def set_pytrend(self, hl='None', tz='None'):
        """Change host language / time zone; the string 'None' keeps a value."""
        if hl != 'None':  # ex) 'ko', 'en_US'
            self.hl = hl
        if tz != 'None':  # ex) 82: Korea, 360: US
            self.tz = tz
        self.update_pytrend()
        self.update_payload()

    def set_payload(self,
                    keyword=None,
                    timeframe='None',
                    cat=-1,
                    geo='None',
                    gprop='None'):
        """Change payload settings and rebuild the payload.

        The sentinels None (keyword), 'None' (strings) and -1 (cat) leave the
        corresponding setting untouched.
        """
        if keyword is not None:
            self.keyword = keyword
        if timeframe != 'None':
            # ex) 'all', 'today 5-y', 'today 1,2,3-m', 'now 1,7-d',
            #     'now 1,4-H', '2018-05-20 2019-01-20'
            self.timeframe = timeframe
        if cat != -1:
            self.cat = cat
        if geo != 'None':  # ex) 'KR', 'US', ''
            self.geo = geo
        if gprop != 'None':  # ex) 'images', 'news', 'youtube', 'froogle'
            self.gprop = gprop
        self.update_payload()

    def load_data(self, keyword=None):
        """Return the 'region' or 'gender' data set; None for anything else."""
        if keyword == 'region':
            # interest_by_region() already returns the converted list, so it
            # does not need to be converted a second time.
            return self.interest_by_region()
        elif keyword == 'gender':
            return self.search_rate_by_gender()
        return None

    def interest_over_time(self):
        """Fetch interest over time and return it in list form."""
        self.interest_over_time_df = self.pytrend.interest_over_time()
        # Keep only the keyword columns (drops the unused isPartial column).
        self.interest_over_time_df = self.interest_over_time_df.iloc[:, :len(
            self.keyword)]
        self.interest_over_time_list = self.interest_over_time_df_to_list()
        return self.interest_over_time_list

    def historical_hourly_interest(self):
        """Fetch hourly interest for the fixed 2019-04-01..2019-05-01 window."""
        self.historical_hourly_interest_df = self.pytrend.get_historical_interest(
            keywords=self.keyword,
            year_start=2019,
            month_start=4,
            day_start=1,
            hour_start=0,
            year_end=2019,
            month_end=5,
            day_end=1,
            hour_end=0,
            cat=0,
            geo='KR',
            gprop='',
            sleep=0)
        # Keep only the keyword columns (drops the unused isPartial column).
        self.historical_hourly_interest_df = (
            self.historical_hourly_interest_df.iloc[:, :len(self.keyword)])
        self.historical_hourly_interest_list = (
            self.historical_hourly_interest_df_to_list())
        return self.historical_hourly_interest_list

    def interest_by_region(self):
        """Fetch the search ratio per region and return it in list form."""
        self.interest_by_region_df = self.pytrend.interest_by_region()
        self.interest_by_region_list = self.interest_by_region_df_to_list()
        return self.interest_by_region_list

    def related_topics(self):
        """Return the related topics (dictionary of DataFrames)."""
        self.related_topics_dict = self.pytrend.related_topics()
        return self.related_topics_dict

    def related_queries(self):
        """Return the related queries (dictionary of DataFrames)."""
        self.related_queries_dict = self.pytrend.related_queries()
        return self.related_queries_dict

    def trending_searches(self):
        """Return the currently trending searches for South Korea."""
        self.trending_searches_df = self.pytrend.trending_searches(
            pn='south_korea')
        return self.trending_searches_df

    def today_searches(self):
        """Return today's searches."""
        self.today_searches_df = self.pytrend.today_searches()
        return self.today_searches_df

    def top_charts(self):
        """Return the yearly top charts (fixed to 2015, Korea)."""
        # date = YYYY integer; geo may also be 'GLOBAL' or 'US'.
        self.top_charts_df = self.pytrend.top_charts(date=2015,
                                                     hl='ko',
                                                     tz='82',
                                                     geo='KR')
        return self.top_charts_df

    def categories(self):
        """Return the Google category tree (names and ids)."""
        self.categories_df = self.pytrend.categories()
        return self.categories_df

    def show_interest_over_time(self):
        """Plot the stored interest-over-time data, one line per keyword."""
        num = 0.0
        plt.figure(figsize=(14, 4))
        plt.style.use('ggplot')  # nicer looking plot
        for key in self.keyword:
            num += 0.1
            plt.plot(self.interest_over_time_df[key],
                     c=plt.cm.rainbow(num),
                     label=key)
        plt.legend(bbox_to_anchor=(1, 1), loc=2)  # legend position
        plt.show()

    def _frame_to_rows(self, frame):
        """Convert a time-indexed frame to [['x', dates...], [keyword, values...], ...]."""
        # NOTE(review): labels keep the date only, so hourly rows of one day
        # share a label -- unchanged from the previous behaviour.
        dates = [
            stamp.date().strftime("%Y-%m-%d")
            for stamp in frame.index.tolist()
        ]
        rows = [['x'] + dates]
        for key in self.keyword:
            rows.append([key] + frame[key].tolist())
        return rows

    def interest_over_time_df_to_list(self):
        """Convert interest_over_time_df from a DataFrame to nested lists."""
        return self._frame_to_rows(self.interest_over_time_df)

    def historical_hourly_interest_df_to_list(self):
        """Convert historical_hourly_interest_df from a DataFrame to nested lists."""
        return self._frame_to_rows(self.historical_hourly_interest_df)

    def interest_by_region_df_to_list(self):
        """Convert interest_by_region_df to [[region name, percentage], ...].

        For every keyword the selected rows are normalised so they add up to
        roughly 100 and merged into six regions; a keyword whose rows sum to
        zero contributes nothing.
        """
        data = []
        for key in self.keyword:
            y = self.interest_by_region_df[key].tolist()
            ratio = sum(y[i] for i in self._RATIO_ROWS) / 100
            if ratio > 0:
                for reg_name, rows in self._REGION_GROUPS:
                    tmp_val = round(sum(y[i] for i in rows) / ratio)
                    data.append([reg_name, tmp_val])
        return data

    def search_rate_by_gender(self):
        """Return placeholder (random) search rates for male and female."""
        gender_data = []
        gender_data.append(['male', random.randint(50, 100)])
        gender_data.append(['female', random.randint(50, 100)])
        return gender_data
class DesignerTrendsCollector(BuilderTrendsCollector):
    """Concrete `BuilderTrendsCollector` backed by the `pytrends` library.

    Each `get_*` method runs one pytrends request and stores its result in the
    current `TrendProduct`, keyed by the bound method that produced it.

    Args:
        BuilderTrendsCollector (class): Abstract class that declares the
            properties and methods implemented here.
    """

    def __init__(
        self,
        keyword_list: list,
        timeframe: str = "today 5-y",
        language: str = "en-US",
        category: int = 0,
        timezone: int = 360,
        country: str = "",
        property_filter="",
        **kwargs,
    ) -> None:
        """Store the search settings, open a pytrends session and build the payload.

        Args:
            keyword_list (list): Keywords to search for.
            timeframe (str, optional): Period to search in.
                Defaults to "today 5-y".
            language (str, optional): Search language. Defaults to "en-US".
            category (int, optional): A specific
                [search category](https://github.com/pat310/google-trends-api/wiki/Google-Trends-Categories).
                Defaults to 0.
            timezone (int, optional):
                [Search timezone](https://developers.google.com/maps/documentation/timezone/overview).
                Defaults to 360.
            country (str, optional): Country to search in. Defaults to "".
            property_filter (str, optional): Property filter of the search;
                only in news, images, YouTube, shopping. Defaults to "".
            **kwargs: Forwarded to `TrendReq`.
        """
        self.keyword_list = keyword_list
        self.timeframe = timeframe
        self.language = language
        self.category = category
        self.timezone = timezone
        self.country = country
        self.property_filter = property_filter

        session = TrendReq(hl=self.language, tz=self.timezone, **kwargs)
        self.pytrends = session
        session.build_payload(
            kw_list=self.keyword_list,
            cat=self.category,
            timeframe=self.timeframe,
            geo=self.country,
            gprop=self.property_filter,
        )
        self.reset()

    def reset(self) -> None:
        """Start over with an empty product."""
        self._product = TrendProduct()

    @property
    def trends(self) -> TrendProduct:
        """Hand out the collected results and reset the collector.

        Returns:
            TrendProduct: The product holding everything collected so far.
        """
        finished = self._product
        self.reset()
        return finished

    def get_interest_over_time(self) -> None:
        """Request data from an interest-over-time search."""
        result = self.pytrends.interest_over_time()
        self._product.add_product(key=self.get_interest_over_time, value=result)

    def get_interest_by_region(self, resolution: str, **kwargs) -> None:
        """Request data from an interest-by-region search.

        Args:
            resolution (str): The resolution of the subregion.
            **kwargs: Forwarded to `interest_by_region`.
        """
        result = self.pytrends.interest_by_region(resolution=resolution, **kwargs)
        self._product.add_product(key=self.get_interest_by_region, value=result)

    def get_trending_searches(self, trend_country: str) -> None:
        """Request the trending searches of a country.

        Args:
            trend_country (str): Name of the country of interest.
        """
        result = self.pytrends.trending_searches(pn=trend_country)
        self._product.add_product(key=self.get_trending_searches, value=result)

    def get_today_searches(self, today_country: str) -> None:
        """Request data from the daily search trends.

        Args:
            today_country (str): Name of the country of interest.
        """
        result = self.pytrends.today_searches(pn=today_country)
        self._product.add_product(key=self.get_today_searches, value=result)

    def get_top_charts(self, date: int, top_country: str) -> None:
        """Request data from a top-charts search.

        Args:
            date (int): Year
            top_country (str): Name of the country of interest.
        """
        result = self.pytrends.top_charts(
            date,
            hl=self.language,
            tz=self.timezone,
            geo=top_country,
        )
        self._product.add_product(key=self.get_top_charts, value=result)

    def get_related_topics(self) -> None:
        """Request the topics related to the keywords."""
        result = self.pytrends.related_topics()
        self._product.add_product(key=self.get_related_topics, value=result)

    def get_related_queries(self) -> None:
        """Request the queries related to the keywords."""
        result = self.pytrends.related_queries()
        self._product.add_product(key=self.get_related_queries, value=result)

    def get_suggestions(self) -> None:
        """Request the suggestion-dropdown entries for every keyword."""
        per_keyword = {}
        for keyword in self.keyword_list:
            per_keyword[keyword] = self.pytrends.suggestions(keyword=keyword)
        self._product.add_product(key=self.get_suggestions, value=per_keyword)

    def get_categories(self) -> None:
        """Request the available categories for the current search."""
        result = self.pytrends.categories()
        self._product.add_product(key=self.get_categories, value=result)

    def get_historical_interest(
        self,
        year_start: int,
        month_start: int,
        day_start: int,
        hour_start: int,
        year_end: int,
        month_end: int,
        day_end: int,
        hour_end: int,
        **kwargs,
    ) -> None:
        """Request data from an hour-gridded time search.

        Args:
            year_start (int): Starting year
            month_start (int): Starting month
            day_start (int): Starting day
            hour_start (int): Starting hour
            year_end (int): Final year
            month_end (int): Final month
            day_end (int): Final day
            hour_end (int): Final hour
            **kwargs: Forwarded to `get_historical_interest`.
        """
        result = self.pytrends.get_historical_interest(
            keywords=self.keyword_list,
            year_start=year_start,
            month_start=month_start,
            day_start=day_start,
            hour_start=hour_start,
            year_end=year_end,
            month_end=month_end,
            day_end=day_end,
            hour_end=hour_end,
            cat=self.category,
            geo=self.country,
            gprop=self.property_filter,
            **kwargs,
        )
        self._product.add_product(key=self.get_historical_interest, value=result)
# Walk through the main pytrends reports for one keyword, printing each result
# followed by a divider line.
divider = '---------------'

pytrends = TrendReq(hl='en-US', tz=360)
key_words = ['Programming']
pytrends.build_payload(key_words)

# Interest over time.
data_frame1 = pytrends.interest_over_time()
print(data_frame1.head(), divider, sep='\n')

# Interest per country.
data_frame2 = pytrends.interest_by_region(resolution='COUNTRY')
print(data_frame2.head(), divider, sep='\n')

# Related topics (printed in full).
data_frame3 = pytrends.related_topics()
print(data_frame3, divider, sep='\n')

# Trending searches in Ukraine.
data_frame4 = pytrends.trending_searches(pn='ukraine')
print(data_frame4.head(), divider, sep='\n')

# Top charts of 2020.
data_frame5 = pytrends.top_charts(date='2020')
print(data_frame5.head(), divider, sep='\n')

# Keyword suggestions.
data_frame6 = pytrends.suggestions('UCU')
print(data_frame6)
def test_related_topics(self):
    """related_topics() yields a result for a two-keyword payload."""
    client = TrendReq()
    client.build_payload(kw_list=['pizza', 'bagel'])
    topics = client.related_topics()
    self.assertIsNotNone(topics)
def GetRelatedTopics(request):
    """Answer the intent with topics related to the KEYWORD slot.

    Asks for a keyword when the slot is missing (-1), apologises with a random
    message when the Google Trends lookup fails, and otherwise reads out a
    shuffled selection of the related topic titles.
    """
    pytrend = TrendReq()
    keyword = getSlotValue(request['intent'], 'KEYWORD')

    # Without a keyword there is nothing to look up.
    if keyword == -1:
        return keywordRequired()

    try:
        pytrend.build_payload(kw_list=[keyword])
        df = pytrend.related_topics()[keyword]
    except Exception as e:
        print(type(e))
        print(e.args)
        ErrorMessages = [
            "Google is not very helpful sometimes, this is so embarrasing.",
            "Sorry for the inconvenience.",
            "This is so embarrasing that I can't help.",
            "Something wrong happened!",
            "There was an error fetching the details.",
            "There was a problem retrieving the data.",
            "Nobody is perfect, atleast I tried.",
            "Sorry for such an absurd behaviour. I can't help.",
            "There was an error",
            "Error's happen. That's what happened right now.",
            "An error occurred!",
            "Keep calm, light a fire and try again.",
            "The operation couldn't be completed."
        ]
        apology = getRandom(ErrorMessages)
        return response_plain_text(
            apology,
            True,
            attributes,
            "Our apologies",
            "Sorry for the inconvenience",
            "What can I do for you?"
        )

    # Collect the titles ordered by value (highest first), then shuffle them
    # so repeated requests do not always read out the same selection.
    ranked = df.sort_values(by='value', ascending=False)
    topics = [entry['title'] for _, entry in ranked.iterrows()]
    random.shuffle(topics)

    length = 5
    outputSpeech, cardContent, extra = getOSandCC(topics, length, True)

    AnswerMessages = [
        "Here are the topics related to ",
        "Here's what I have for you regarding ",
        "Here are some top topics related to ",
        "Here are the best topics related to ",
        "This is what I've got related to ",
        "Yes, I've got something for you regarding ",
        "Best topics in the category of "
    ]
    opening = getRandom(AnswerMessages)
    return response_plain_text(
        opening + keyword + " ... " + outputSpeech,
        True,
        {},
        "Hot topics related to - " + keyword,
        cardContent,
        "Would you like to hear about " + extra
    )
def test_related_topics(self):
    """Check that related_topics() returns something for a two-keyword payload."""
    client = TrendReq()
    client.build_payload(kw_list=["pizza", "bagel"])
    topics = client.related_topics()
    self.assertIsNotNone(topics)
# Related Queries for 1 Hour rq_ = pytrends3.related_queries() q = 'query' v = 'value' for keys in rq_.keys(): df_rq_['Top Queries'] = rq_[keys]['top'][q] df_rq_['Top Query Value '] = rq_[keys]['top'][v] df_rq_['Rising Query'] = rq_[keys]['rising'][q] df_rq_['Rising Query Value'] = rq_[keys]['rising'][v] df_rq_.to_csv("Google_Trends/RelatedQueries1Day.csv", header=False, mode='w', index=False) # Related Topics over the last 1 Hour ts_ = pytrends3.related_topics() df_ts_ = pd.DataFrame(columns=['Title', 'Value', 'Mid']) for keys in ts_.keys(): df_ts_['Title'] = ts_[keys].title df_ts_['Value'] = ts_[keys].value df_ts_['Mid'] = ts_[keys].mid df_ts_.to_csv("Google_Trends/RelatedTopics1Day.csv", header=False, mode='w', index=False) counter += 1 print("Hour", counter) time.sleep(3600)
year_start=2022, month_start=1, day_start=1, hour_start=0, year_end=2022, month_end=2, day_end=10, hour_end=23, ) data # the keyword to extract data kw = "python" pt.build_payload([kw], timeframe="all") # get the interest by country ibr = pt.interest_by_region("COUNTRY", inc_low_vol=True, inc_geo_code=True) # sort the countries by interest ibr[kw].sort_values(ascending=False) # get related topics of the keyword rt = pt.related_topics() rt[kw]["top"] # get related queries to previous keyword rq = pt.related_queries() rq[kw]["top"] # get suggested searches pt.suggestions("python") # another example of suggested searches pt.suggestions("America") # trending searches per region ts = pt.trending_searches(pn="united_kingdom") ts
query_json = json.loads(line.rstrip()) keyword = query_json['keyword'] gt_queries = query_json['gt_queries'] # if current keyword exists in visited query, skip current keyword if len(visited_query) > 0 and keyword in visited_query: continue # get the topic id if no topic exists if 'topic_id' not in query_json: trends_crawler = TrendReq() trends_crawler.build_payload(keyword=gt_queries, timeframe=ALL_PERIOD, gprop=GPROP) related_topics_list = trends_crawler.related_topics() artist1, title1 = keyword.split(' - ', 1) artist1 = reformat(artist1) title1 = reformat(title1) topic_id = None # select the song mid with the highest relevant score from a list of related topics for topic_quad in related_topics_list: type = topic_quad['type'] value = topic_quad['value'] # return the first song mid if type.startswith('Song by'): if value > 40: topic_id = topic_quad['mid'] query_json.update(topic_quad)