def recursive_search(query_input, cutoff_date=None):
    """Search `site` for news, rank extracted keywords by similarity to the
    query, and build a refined follow-up query from the best keywords."""
    global model
    if not model:
        model = embedding.EmbeddingModel()
    if cutoff_date is None:
        # Evaluate the default at call time; a `date.today()` default in the
        # signature would be frozen at import time.
        cutoff_date = date.today()
    query = "site:{} {}".format(site, query_input)
    list_url = list(search_news(query, tld="co.in", num=15, stop=15, pause=2))
    result = []  # each entry: {title, url, similarity, keyword, date}
    for url in list_url:
        news_date = extract_date(url)
        if news_date and news_date < cutoff_date:
            pagehead = extract_pagehead(url)
            keywords = simple_rake(pagehead)
            for keyword in keywords:
                similarity = model.phraseSimilarity(keyword[1], query_input)
                result.append({
                    'title': pagehead,
                    'url': url,
                    'similarity': similarity,
                    'keyword': keyword[1],
                    'date': news_date,
                })
    # Drop near-exact matches, then keep the three most similar keywords.
    result = [r for r in result if r['similarity'] < 0.99]
    result = sorted(result, key=lambda x: x['similarity'], reverse=True)[:3]
    new_query = " ".join(info['keyword'] for info in result)
    return result, new_query
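
# A minimal driver sketch (not from the source) showing how recursive_search
# can be chained: each round feeds the refined keyword query back in and moves
# the cutoff date before the oldest article found so far. Assumes `site`,
# `model`, and the helper functions used above are configured.
def build_timeline(seed_query, rounds=3):
    query, cutoff = seed_query, None
    timeline = []
    for _ in range(rounds):
        hits, query = recursive_search(query, cutoff)
        if not hits:
            break
        timeline.extend(hits)
        cutoff = min(hit['date'] for hit in hits)
    return timeline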
def google_search(query_input):
    query = "site:{} {}".format(site, query_input)
    list_url = list(search_news(query, tld="co.in",
                                num=num_search_url, stop=num_search_url,
                                pause=3))
    current_date = date.today()
    info = {'results': []}
    print("extracting timeline.....")
    for url in list_url:
        news_date = extract_date(url)
        # Only keep articles strictly older than the previous hit, so the
        # collected results form a backwards-moving timeline.
        if news_date and news_date < current_date:
            current_date = news_date
            pagehead = extract_pagehead(url)
            info['results'].append({
                'date': news_date,
                'title': pagehead,
                'url': url
            })
            print(news_date)
            print(pagehead)
            print(url)
            print(simple_rake(pagehead))
            print(" ")
    # Newest first (ascending sort followed by reverse() is just a
    # descending sort).
    info['results'] = sorted(info['results'], key=lambda x: x['date'],
                             reverse=True)
    return info
def article_generator_text(keyword_query, num_articles):
    text = ''
    for url in search_news(str(keyword_query), num=1, stop=num_articles):
        article = NewsPlease.from_url(str(url))
        # Skip articles with no extractable body and skip excluded domains
        # (`new_list` is assumed to be defined elsewhere in the module).
        if article.text is not None and article.source_domain not in new_list:
            text = text + article.text
    return text
def google_news(term):
    # Restrict the news search to The Guardian and yield parsed articles.
    tgn = googlesearch.search_news(term, domains=["www.theguardian.com"])
    for url in tgn:
        print(url)
        yield extract_article(url)
        print()
        print("*" * 80)
        print()
def search():
    list_of_results = []
    query = str(txt_stock_search.get()).upper() + ' news'
    for j in googlesearch.search_news(query, stop=10, pause=2.0):
        list_of_results.append(j)
    # Insert results into the listbox at 1-based positions.
    for index, result in enumerate(list_of_results, start=1):
        listbox.insert(index, result)
def google_news(term):
    gns = googlesearch.search_news(term)
    for url in gns:
        print(url, end="\r")
        try:
            yield extract_article(url)
            print("Crawled! :)", url)
        except Exception as e:
            print("NOT crawled!", url)
            print(e)
            print("*" * 80)
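
# Example consumption of the google_news generator above (a sketch;
# extract_article is assumed to return parsed article content).
articles = []
for article in google_news("renewable energy"):
    articles.append(article)
    if len(articles) >= 5:  # stop after the first five crawlable hits
        break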
def get_results(query, lang='en', n_top=30):
    urls = googlesearch.search_news(query, num=30)
    output = []
    c = 0
    for url in urls:
        # Note: the article is downloaded but never parsed or used; only the
        # domain portion of the URL is kept.
        article = Article(url)
        article.download()
        start = url.find('www')
        end = url.find('/', start + 1)
        output.append(url[start:end])
        c += 1
        if c == n_top:
            break
    return output
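
# A more robust alternative (sketch) to the find('www') slicing above: the
# standard library's urlparse also handles URLs without a 'www' prefix.
from urllib.parse import urlparse

def get_domain(url):
    return urlparse(url).netloc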
def return_websearch_outputs(self, return_websearch_outputs, search_term):
    if not return_websearch_outputs:
        return
    query = search_term
    search_results_list = []
    for result in search_news(
            str(query),    # the query you want to run
            lang='en',     # the language
            num=10,        # number of results per page
            start=0,       # first result to retrieve
            stop=10,       # last result to retrieve
            pause=0.0001,  # lapse between HTTP requests (this is very short
                           # and risks rate limiting)
    ):
        search_results_list.append(result)
    self.Output['search_results_list'] = search_results_list
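
# Hypothetical invocation (sketch): the surrounding class and its Output
# dict are assumptions based on the method above.
# obj.return_websearch_outputs(True, "solar energy storage")
# print(obj.Output['search_results_list'])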
def scrape_news():
    query = "Male Contraception"
    news = []
    for j in search_news(query, tld='com', lang='en', num=10, start=0,
                         stop=10, pause=2.0):
        news.append(j)
    # Store the first eight result URLs in a dictionary keyed site1..site8.
    news_data = {"site{}".format(i + 1): news[i] for i in range(8)}
    return render_template("newsscrape.html", recentNews=news_data)
def download(keyword, topdomain, num_articles, rest):
    # Empty newspaper dictionary to hold downloaded articles.
    newsPaper = {"articles": []}
    for result in search_news(query=keyword, tld=topdomain, lang='en',
                              num=10, stop=num_articles, pause=rest):
        try:
            print("------------------------------------------------------------")
            print('Searching for: ', keyword)
            article = {}
            news = Article(result)
            news.download()
            news.parse()
            news.nlp()
            article['link'] = result
            article['title'] = news.title
            article['firm'] = keyword
            article['text'] = news.text
            article['keywords'] = news.keywords
            if news.publish_date:
                article['published'] = news.publish_date.isoformat()
            else:
                article['published'] = news.publish_date
            article['author'] = news.authors
            print(result)
            print(news.title)
            newsPaper['articles'].append(article)
        except Exception:
            print(result + " \nError: could not be downloaded.")
    return newsPaper
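
# Hypothetical usage of download(): fetch up to five articles mentioning a
# firm, pausing two seconds between requests (names are illustrative).
paper = download("Acme Corp", "com", num_articles=5, rest=2.0)
for art in paper["articles"]:
    print(art["published"], art["title"], art["link"])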
def google_scrape(url):
    req = urllib.request.Request(url, headers={'User-Agent': "Magic Browser"})
    con = urllib.request.urlopen(req)
    soup = BeautifulSoup(con, "html.parser")
    return soup.title.text


i = 1
Table_list = []
for query in columns:
    print(query)
    # tbs='qdr:h' restricts results to the last hour.
    for url in search_news(query, tbs='qdr:h'):
        print(url)
        Table_list.append(google_scrape(url))
        i += 1
        #time.sleep(1)

# DataFrame.append was removed in pandas 2.0; build the frame directly
# from the collected titles instead.
pd.DataFrame(Table_list).to_csv("test.csv")
def acquisitions():
    # Naive method: the news scraper downloads articles and compares each
    # title against a list of acquisition-related terms.
    acquisition_matches = ["buys", "buy", "bought", "acquires", "acquire",
                           "acquisitions", "acquisition", "purchases",
                           "purchase", "merger", "merge", "merges",
                           "merging", "invested", "invests", "invest",
                           "secure"]

    # Check whether any word of a title is an acquisition term. (The
    # original pair of lambdas inverted the membership test and fused
    # "merges" and "merging" into one string via implicit concatenation.)
    contains_acquisition = lambda title: any(
        word in acquisition_matches for word in title.lower().split())

    # Ensure that there's an input path to download firms from.
    if not INPUT_FIRMS:
        print('No firms are loaded to run Acquisitions.')
        input("Press Enter to continue...")
        return

    # If neither of our fake-news classifiers exists, create and store them.
    if not path.exists(TFIDF_PKL) or not path.exists(NB_PKL):
        fake_news_classifier.create_model(TRAIN_PKL, TFIDF_PKL, NB_PKL)
    tfidf = fake_news_classifier.load_model(TFIDF_PKL)
    nb_body = fake_news_classifier.load_model(NB_PKL)

    firms = reader.read_firms(INPUT_FIRMS)
    all_news = {'firms': {}}
    for firm in firms:
        newsPaper = {"articles": []}
        print("------------------------------------------------------------")
        print("Searching for acquisitions: ", firm)
        for result in search_news(query=firm + " acquisition", tld="co.in",
                                  num=10, stop=5, pause=2):
            try:
                article = {}
                news = Article(result)
                news.download()
                news.parse()
                news.nlp()
                print("Downloaded: ", news.title)
                if contains_acquisition(news.title):
                    print("*" * 10 + "Acquisition found." + "*" * 10)
                    article['link'] = result
                    article['title'] = news.title
                    article['text'] = news.text
                    article['firm'] = firm
                    article['keywords'] = news.keywords
                    if news.publish_date:
                        article['published'] = news.publish_date.isoformat()
                    else:
                        article['published'] = news.publish_date
                    article['author'] = news.authors
                    newsPaper['articles'].append(article)
            except Exception:
                print("Error: Article could not be downloaded.")
        if newsPaper['articles']:
            newsPaper = news_scraper.mark_relevancy(newsPaper)
            all_news['firms'][firm] = newsPaper

    df_acq = json_handler.json_to_pd(all_news)
    if not df_acq.empty:
        df_acq = text_cleaner.clean(df_acq)
        # Remove some unnecessary columns.
        df_acq = df_acq.drop(columns='text')
        df_acq = df_acq.drop(columns='author')
        df_acq = fake_news_classifier.classify(tfidf, nb_body, df_acq)
        # Remove any rows that are fake or not relevant enough to the firm.
        print("Filtering dataframe for labels and relevancy.")
        df_acq = filters.filter_label(df_acq)
        df_acq = filters.filter_relevancy(df_acq)
        df_acq.to_excel(OUTPUT_XLSX)
        print("All relevant news articles downloaded to ", OUTPUT_XLSX)
    else:
        df_acq.to_excel(OUTPUT_XLSX)
        print("No articles found.")
    input("Press Enter to continue...")
def search(query, tld='com', lang='en', tbs='0', safe='off', num=10, start=0,
           stop=1, domains=None, pause=2.0, only_standard=False,
           extra_params=None, tpe='', user_agent=None, type='none', rights='',
           download=False, path=None, out_format="html"):
    # Avoid a shared mutable default for extra_params.
    if extra_params is None:
        extra_params = {}
    # Except where noted, the googlesearch.search_* calls below return a
    # generator of found URL strings; if `stop` is None the generator will
    # loop forever.
    if type in ('text', 'none', None):
        # Normal web search.
        if download:
            p = path if path is not None else "downloads"
            return Download(
                googlesearch.search(query, tld, lang, tbs, safe, num, start,
                                    stop, domains, pause, only_standard,
                                    extra_params, tpe, user_agent),
                p, out_format)
        return googlesearch.search(
            query, tld, lang, tbs, safe, num, start, stop, pause,
            domains,  # country
            # only_standard,
            extra_params,
            # tpe,
            user_agent
            # ssl
        )
    elif type == 'image_home':
        # Image search via googlesearch.
        if download:
            p = path if path is not None else "downloads"
            return Download(
                googlesearch.search_images(query, tld, lang, tbs, safe, num,
                                           start, stop, pause, domains,
                                           only_standard, extra_params),
                p, out_format)
        return googlesearch.search_images(query, tld, lang, tbs, safe, num,
                                          start, stop, pause, domains,
                                          only_standard, extra_params)
    elif type in ('image', 'images'):
        # Image search via the google_images_download library, which can
        # download the images directly.
        response = google_images_download.googleimagesdownload()
        language = "Swedish" if lang == "sv" else "English"
        arguments = {
            "keywords": query,
            "limit": num,
            "print_urls": "false",
            "language": language,
        }
        # Add usage rights if a recognized value was given.
        if rights in ["labeled-for-reuse-with-modifications",
                      "labeled-for-reuse",
                      "labeled-for-noncommercial-reuse-with-modification",
                      "labeled-for-nocommercial-reuse"]:
            arguments["usage_rights"] = rights
        # Add safe search.
        if safe == "true":
            arguments["safe_search"] = safe
        if download:
            if path is not None:
                arguments["output_directory"] = path
        else:
            arguments["no_download"] = "true"
            arguments["no_directory"] = "true"
        return response.download(arguments)
    elif type in ('video', 'film', 'movie'):
        # Video search.
        if download:
            p = path if path is not None else "downloads"
            return Download(
                googlesearch.search_videos(query, tld, lang, tbs, safe, num,
                                           start, stop, pause, domains,
                                           only_standard, extra_params),
                p, out_format)
        return googlesearch.search_videos(query, tld, lang, tbs, safe, num,
                                          start, stop, domains, pause,
                                          only_standard, extra_params)
    elif type == 'news':
        # News search.
        if download:
            p = path if path is not None else "downloads"
            return Download(
                googlesearch.search_news(query, tld, lang, tbs, safe, num,
                                         start, stop, domains, pause,
                                         only_standard, extra_params),
                p, out_format)
        return googlesearch.search_news(query, tld, lang, tbs, safe, num,
                                        start, stop, domains, pause,
                                        only_standard, extra_params)
    elif type == 'lucky':
        # "I'm feeling lucky" search; returns a single URL string.
        if download:
            p = path if path is not None else "downloads"
            return Download(
                googlesearch.lucky(query, tld, lang, tbs, safe, only_standard,
                                   extra_params, tpe),
                p, out_format)
        return googlesearch.lucky(query, tld, lang, tbs, safe, only_standard,
                                  extra_params, tpe)
    elif type == 'shop':
        # Shopping search.
        if download:
            p = path if path is not None else "downloads"
            return Download(
                googlesearch.search_shop(query, tld, lang, tbs, safe, num,
                                         start, stop, domains, pause,
                                         only_standard, extra_params),
                p, out_format)
        return googlesearch.search_shop(query, tld, lang, tbs, safe, num,
                                        start, stop, domains, pause,
                                        only_standard, extra_params)
    elif type in ('app', 'apps'):
        # App search.
        if download:
            p = path if path is not None else "downloads"
            return Download(
                googlesearch.search_apps(query, tld, lang, tbs, safe, num,
                                         start, stop, domains, pause,
                                         only_standard, extra_params),
                p, out_format)
        return googlesearch.search_apps(query, tld, lang, tbs, safe, num,
                                        start, stop, domains, pause,
                                        only_standard, extra_params)
    elif type in ('books', 'book'):
        # Book search.
        if download:
            p = path if path is not None else "downloads"
            return Download(
                googlesearch.search_books(query, tld, lang, tbs, safe, num,
                                          start, stop, domains, pause,
                                          only_standard, extra_params),
                p, out_format)
        return googlesearch.search_books(query, tld, lang, tbs, safe, num,
                                         start, stop, domains, pause,
                                         only_standard, extra_params)
    else:
        raise Exception("Unsupported type as parameter to search function!")
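
# Example calls against the dispatcher above (a sketch; the googlesearch
# wrappers and the Download class must be importable for these to run).
for url in search("climate change", type='news', num=5, stop=5):
    print(url)

lucky_url = search("python documentation", type='lucky')  # single URL string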
# The def line below is reconstructed from the call site further down
# (tokenize_articles(bitcoin_parsed)); it was missing from the snippet.
def tokenize_articles(article_list):
    """Returns a list of cleaned token lists, one per article."""
    article_tokens = []
    cleaned_tokens = []
    # Use nltk to tokenize each article.
    for article in article_list:
        tokens = word_tokenize(article)
        article_tokens.append(tokens)
    # Remove punctuation and empty tokens.
    for tokens in article_tokens:
        tokens_noPunc = [
            ''.join(char for char in tok if char not in string.punctuation)
            for tok in tokens
        ]
        tokens_noPunc = [word for word in tokens_noPunc if word]
        cleaned_tokens.append(tokens_noPunc)
    return cleaned_tokens


# Use search_news to get news URLs for the query "bitcoin" from the last hour.
bitcoin_article_url = []
for url in search_news('bitcoin', tbs="qdr:h", stop=50):
    bitcoin_article_url.append(url)

bitcoin_parsed = parse_url(bitcoin_article_url)
bitcoin_tokens = tokenize_articles(bitcoin_parsed)
bitcoin_clean_text = [' '.join(x) for x in bitcoin_tokens]
import pandas as pd
from googlesearch import search_news

df = pd.read_excel('Malware_name.xlsx')

for i in range(1, 63):
    malware_name = df[df['family'] == i].values[:, 0]
    print(malware_name)
    num = len(malware_name)
    n = int(200 / num / 2)
    txt = str(i) + ".txt"
    f = open('../url/' + txt, "a")
    j = 0
    # Search blogs.
    for name in malware_name:
        search_name = "IoT Malware " + name
        #search_name = name
        for url in search_news(search_name, stop=n, tbs="nrt:b"):
            j += 1
            print(j)
            f.write(url + '\n')
    # Search news.
    for name in malware_name:
        search_name = "IoT Malware " + name
        #search_name = name
        for url in search_news(search_name, stop=n):
            f.write(url + '\n')
    f.write(str(j))
    f.close()
def __iter__(self):
    # Yield news URLs for the stored query, skipping direct PDF links.
    return (url for url in GS.search_news(self.query, pause=0.5)
            if not url.endswith('pdf'))
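
# A minimal sketch of the enclosing class this __iter__ implies; the class
# name and constructor are assumptions, not taken from the source.
class NewsQuery:
    def __init__(self, query):
        self.query = query

    def __iter__(self):
        return (url for url in GS.search_news(self.query, pause=0.5)
                if not url.endswith('pdf'))

# Usage: for url in NewsQuery("quantum computing"): print(url)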
# Not working as expected: news search.
# Direction/geolocation handling should be done here.
elif 'google' in query:
    speak('Searching Google...')
    query = query.replace("google", "")
    if 'youtube video' in query:
        speak("Directing to Youtube.com")
        if 'mostlysane' in query:
            webbrowser.open(
                "https://www.youtube.com/channel/UCvCyIiKSCA1fHKSCOKJyjXA")
        if 'desivedesi cooking' in query:
            webbrowser.open(
                "https://www.youtube.com/channel/UCu0pMnPnjuyjQ8VsPPGg4EA")
    elif 'news' in query:
        for results in search_news(query, num=1, stop=1, pause=1):
            speak("Directing to")
            webbrowser.open(results)
            speak(results)
    elif 'direction' in query:
        if 'car' in query:
            webbrowser.open(
                "https://www.google.com/maps/dir/Pune,+Maharashtra//@18.5205085,73.8217241,13z/data=!3m1!4b1!4m9!4m8!1m5!1m1!1s0x3bc2bf2e67461101:0x828d43bf9d9ee343!2m2!1d73.8567437!2d18.5204303!1m0!3e0")
        elif 'bus' in query:
            webbrowser.open(
                "https://www.google.com/maps/dir/Pune,+Maharashtra//@18.5205085,73.8217241,13z/data=!3m1!4b1!4m9!4m8!1m5!1m1!1s0x3bc2bf2e67461101:0x828d43bf9d9ee343!2m2!1d73.8567437!2d18.5204303!1m0!3e3")
        elif 'cycling' in query:
            webbrowser.open(
                "https://www.google.com/maps/dir/Kaveri+College+of+Arts+Science+and+Commerce,+Pune,+Maharashtra//@18.5023602,73.7917972,13z/data=!3m1!4b1!4m9!4m8!1m5!1m1!1s0x3bc2bf8c470e2413:0x30f1c26c617b4c2f!2m2!1d73.8268168!2d18.502282!1m0!3e1")
def func_googletwitter(currency):
    #############################################################
    ### Twitter Followers Count #################################
    #############################################################
    Ntwitter, Ngoogle, Alexa, Alexa2 = 'N/A', 'N/A', 'N/A', 'N/A'

    def find_links(soup, utm_tag):
        """Collect Twitter handles and (de-tagged) website links on a page."""
        handles = [a['href'].split("https://twitter.com/", 1)[1]
                   for a in soup.find_all('a', href=True)
                   if 'https://twitter.com/' in a['href']]
        sites = [a['href'][:a['href'].index("?")]
                 for a in soup.find_all('a', href=True)
                 if 'http' in a['href'] and utm_tag in a['href']]
        return handles, sites

    try:
        response = requests.get('https://icobench.com/ico/' + currency)
        soup_c = BeautifulSoup(response.text, 'html.parser')
        handles, sites = find_links(soup_c, '?utm_source=icobench')
        ICO_Twitter1 = handles[0]   # first Twitter link on the page
        ICO_Web1 = sites[0]
    except Exception:
        ICO_Twitter1 = 'N/A'
        ICO_Web1 = 'N/A'

    try:
        response = requests.get('https://icorating.com/ico/' + currency + '/#details',
                                headers={'User-Agent': 'Mozilla/5.0'})
        soup_c = BeautifulSoup(response.text, 'html.parser')
        handles, sites = find_links(soup_c, '?utm_source=icorating')
        ICO_Twitter2 = handles[-2]  # second-to-last Twitter link
        ICO_Web2 = sites[0]
    except Exception:
        ICO_Twitter2 = 'N/A'
        ICO_Web2 = 'N/A'

    try:
        response = requests.get('https://icomarks.com/ico/' + currency,
                                headers={'User-Agent': 'Mozilla/5.0'})
        soup_c = BeautifulSoup(response.text, 'html.parser')
        handles, sites = find_links(soup_c, '?utm_source=icomarks')
        ICO_Twitter3 = handles[1]   # second Twitter link
        ICO_Web3 = sites[0]
    except Exception:
        ICO_Twitter3 = 'N/A'
        ICO_Web3 = 'N/A'

    ICO_T = [h for h in [ICO_Twitter1, ICO_Twitter2, ICO_Twitter3]
             if h not in ['N/A', 'icorating', 'ICO_marks', 'ICObench']]
    ICO_Web = [w for w in [ICO_Web1, ICO_Web2, ICO_Web3]
               if w not in ['N/A', 'icorating', 'ICO_marks', 'ICObench']]
    ICO_Twitter = ICO_T[0] if ICO_T else 'N/A'
    ICO_Website = ICO_Web[0] if ICO_Web else 'N/A'

    # Credentials redacted; supply your own Twitter API keys. (The original
    # snippet hardcoded real-looking keys, which should never be committed.)
    consumer_key = "YOUR_CONSUMER_KEY"
    consumer_secret = "YOUR_CONSUMER_SECRET"
    access_token = "YOUR_ACCESS_TOKEN"
    access_token_secret = "YOUR_ACCESS_TOKEN_SECRET"

    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth)
    try:
        user = api.get_user(ICO_Twitter)
        ICO_Twitter_N = user.followers_count  # already an int; eval() was unnecessary
    except Exception:
        ICO_Twitter_N = 'N/A'

    #############################################################
    ### Google News Articles Count ##############################
    #############################################################
    query = '"' + currency + ' ICO' + '"'
    try:
        k_news = 0
        # Count news hits between 1/1/2015 and today (cdr = custom date range).
        # Note: stop=1 retrieves at most one result, so k_news is 0 or 1.
        date_max = dt.datetime.now().date().isoformat().replace('-', '/')
        for j in search_news(query, tld="com", lang="en", num=30, stop=1,
                             pause=2,
                             tbs='cdr:1,cd_min:1/1/2015,cd_max:' + date_max):
            k_news = k_news + 1
    except Exception:
        k_news = 0

    #############################################################
    ### Alexa Rank ##############################################
    #############################################################
    # Warning: these metrics are updated every 3 months.
    Alexa_N = 'N/A'
    Alexa_N2 = 'N/A'
    try:
        response = requests.get('https://www.alexa.com/siteinfo/' + ICO_Website,
                                headers={'User-Agent': 'Mozilla/5.0'})
        soup_d = BeautifulSoup(response.text, 'html.parser')
        metric = [tag.text.replace(" ", "").replace("\n", "")
                  for tag in soup_d.findAll(
                      "strong", {"class": "metrics-data align-vmiddle"})]
        # The page shows either 5 or 6 metrics; the offsets shift by one.
        if len(metric) in (5, 6):
            off = len(metric) - 5
            daily_views = float(metric[2 + off])
            alexa_rank = float(metric[0].replace(",", ""))
            bounce_rate = round(float(metric[1 + off].replace("%", "")) / 100., 2)  # fraction
            # Daily time on site is formatted "M:SS".
            minutes, _, seconds = metric[3 + off].partition(":")
            daily_time = float(minutes) * 60 + float(seconds)
            Alexa_N = daily_views
            Alexa_N2 = daily_time
    except Exception:
        Alexa_N = 'N/A'
        Alexa_N2 = 'N/A'

    Ntwitter = ICO_Twitter_N
    Ngoogle = k_news
    Alexa = Alexa_N
    Alexa2 = Alexa_N2
    return 'Twitter/Google/Alexa', Ntwitter, Ngoogle, Alexa, Alexa2
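
# Example call (sketch): unpack the metrics tuple returned above; the
# currency slug is illustrative.
label, n_twitter, n_google, alexa_views, alexa_time = func_googletwitter('examplecoin')
print(label, n_twitter, n_google, alexa_views, alexa_time)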