def get_term():
    if request.method == 'POST':
        tag = request.form['srch-term']
        tt = tag
        mg = MagicGoogle()
        lijst = []
        search = str(tt + ' language:english file:html')
        for url in mg.search_url(query=search):
            lijst.append(url)
        df = pd.DataFrame({'url': lijst})
        # Scrape page features in parallel, then drop rows where any feature failed.
        dff = applyParallel(df.groupby(df.index), tmpFunc)
        dff = dff.query('wordcount != "err" & reponsetime != "err" & highwot != "err" & yeararchive != "err"')
        del dff['url']
        newX = dff.values
        newX = np.delete(newX, [6], axis=1)
        pickle_fname = 'pickle.model'
        pickle_model = pickle.load(open(pickle_fname, 'rb'))
        result = pickle_model.predict(newX)
        px2 = result.reshape((-1, 8))
        dffres = pd.DataFrame({
            'OverallQuality': px2[:, 0],
            'accuracy': px2[:, 1],
            'completeness': px2[:, 2],
            'neutrality': px2[:, 3],
            'relevance': px2[:, 4],
            'trustworthiness': px2[:, 5],
            'readability': px2[:, 6],
            'precision': px2[:, 7]
        })
        return render_template('mp.html',
                               dataframe=dff.to_html(index=False),
                               res=dffres.to_html(index=False))
def conasearch(engine='google', key='python', page_num=0, page_size=30):
    result = []
    if engine == 'google':
        PROXIES = [{
            'http': '192.168.1.159:1080',
            'https': '192.168.1.159:1080'
        }]
        mg = MagicGoogle(PROXIES)
        sleep = random.randint(2, 15)
        for i in mg.search_url(query=key, num=page_size, pause=sleep,
                               start=page_num * page_size):
            # print '[URL]', i
            result.append(i)
    elif engine == 'baidu':
        global proxies
        mb = MagicBaidu(proxies)
        for i in mb.search(query=key, pn=page_size * (page_num - 1), rn=page_size):
            print '[URL]', i['url'], ': ', '[TITLE]', i['title']
            result.append(i['url'])
    else:
        print 'invalid engine :', engine
        exit(1)
    q.put(result)
def __init__(self, logger):
    service_args = [
        '--proxy=162.105.146.128:65434',
        '--proxy-type=http',
        '--proxy-auth=lcq:lcq123',
    ]
    self.browser = webdriver.PhantomJS(service_args=service_args)
    self.magic_google_tool = MagicGoogle()
    self.logger = logger
def get_term():
    if request.method == 'POST':
        try:
            tag = request.form['srch-term']
            tt = tag
            mg = MagicGoogle()
            lijst = []
            search = str(tt + ' language:english file:html')
            for url in mg.search_url(query=search):
                lijst.append(url)
            df = pd.DataFrame({'url': lijst})
            # Scrape page features in parallel; drop rows where scraping failed.
            dff = applyParallel(df.groupby(df.index), tmpFunc)
            dff = dff.query('words != "err" & latency != "err"')
            dff = dff[[
                'readable', 'reputationHigh', 'reputationLow', 'pictures',
                'polarity', 'latency', 'subjectivity', 'words', 'maturity',
                'url'
            ]]
            newX = dff.values
            # Drop the url column (index 9) before prediction.
            newX = np.delete(newX, [9], axis=1)
            result = pickle_model.predict(newX)
            px2 = result.reshape((-1, 8))
            dffres = pd.DataFrame({
                'complete': px2[:, 0],
                'accuracy': px2[:, 1],
                'precise': px2[:, 2],
                'readable': px2[:, 3],
                'relevant': px2[:, 4],
                'trustworthy': px2[:, 5],
                'overall': px2[:, 6],
                'neutral': px2[:, 7]
            })
            return render_template('mp.html',
                                   dataframe=dff.to_html(index=False),
                                   res=dffres.to_html(index=False))
        except Exception:
            flash('We failed to find a matching URL. Please try another query.',
                  'nameError')
            return redirect('/')
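`pickle_model` is referenced above without a local load, so it is presumably created once at module import; a minimal sketch of that pattern, reusing the 'pickle.model' filename from the neighbouring snippets (an assumption for this particular route):

import pickle

# Hypothetical module-level load: done once at import time so every
# request handler reuses the same trained model.
with open('pickle.model', 'rb') as fh:
    pickle_model = pickle.load(fh)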
def get_term():
    if request.method == 'POST':
        tag = request.form['srch-term']
        tt = tag
        mg = MagicGoogle()
        lijst = []
        search = str(tt + ' language:english file:html')
        for url in mg.search_url(query=search):
            lijst.append(url)
        df = pd.DataFrame({'url': lijst})
        dff = applyParallel(df.groupby(df.index), tmpFunc)
        newX = dff.values
        newX = np.delete(newX, [6], axis=1)
        pickle_fname = 'pickle.model'
        pickle_model = pickle.load(open(pickle_fname, 'rb'))
        try:
            result = pickle_model.predict(newX)
        except Exception:
            # Fall back to a single all-zero row if prediction fails.
            result = np.array([0, 0, 0, 0, 0, 0, 0, 0])
        px2 = result.reshape((-1, 8))
        dffres = pd.DataFrame({
            'OverallQuality': px2[:, 0],
            'accuracy': px2[:, 1],
            'completeness': px2[:, 2],
            'neutrality': px2[:, 3],
            'relevance': px2[:, 4],
            'trustworthiness': px2[:, 5],
            'readability': px2[:, 6],
            'precision': px2[:, 7]
        })
        pd.set_option('display.max_colwidth', -1)
        return render_template('mp.html',
                               dataframe=dff.to_html(index=False),
                               res=dffres.to_html(index=False))
def PDFUrlGeter(filename):
    '''
    Search Google for `filename` and collect candidate PDF URLs,
    falling back to progressively looser queries.
    :return: list of URLs
    '''
    try:
        mg = MagicGoogle()
        urls = mg.search_url(query=filename + ' filetype:pdf')
        u = [i for i in urls]
        if u:
            return u
        time.sleep(random.randint(1, 5))
        urls = mg.search_url(query=filename + ' inurl:pdf')
        u = [i for i in urls]
        if u:
            return u
        time.sleep(random.randint(1, 5))
        urls = mg.search_url(query=filename)
        u = [i for i in urls if 'pdf' in i]
        return u
    except Exception:
        errorlist.append(filename)
        return []
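A minimal usage sketch, assuming the module-level `errorlist` the function appends to; the query string is only an illustration:

errorlist = []  # collects filenames whose search raised an exception

for url in PDFUrlGeter('deep learning tutorial'):
    print(url)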
class TestMagicGoogle(unittest.TestCase):
    """
    Test MagicGoogle class
    """

    def setUp(self):
        PROXIES = [{'http': '127.0.0.1:1080', 'https': '127.0.0.1:1080'}]
        self.mg = MagicGoogle(PROXIES)

    def tearDown(self):
        self.mg = None

    def test_search_url(self):
        sleep = random.randint(2, 15)
        result = list(self.mg.search_url(query='python', num=1, pause=sleep))
        self.assertEqual(result[0], 'https://www.python.org/',
                         'test search_url fail')
class GoogleSearch:

    def __init__(self, logger):
        service_args = [
            '--proxy=162.105.146.128:65434',
            '--proxy-type=http',
            '--proxy-auth=lcq:lcq123',
        ]
        self.browser = webdriver.PhantomJS(service_args=service_args)
        self.magic_google_tool = MagicGoogle()
        self.logger = logger

    def __del__(self):
        # Close the browser.
        self.browser.close()

    def search(self, query):
        '''
        :param query: keywords filled in Google
        :return: url list of reuters
        '''
        url = self.magic_google_tool.get_url_from_query(query=query)
        url = url.replace('https', 'http')
        self.browser.get(url)
        self.logger.info(url)
        reuters_url = list()
        for element in self.browser.find_elements_by_class_name('r'):
            a = element.find_element_by_tag_name('a')
            href = a.get_attribute('href')
            self.logger.info(href)
            '''
            href is like http://www.google.com.hk/url?
            q=http://www.reuters.com/article/us-htc-earnings-idUSBREA0403F20140105
            &sa=U&ved=0ahUKEwj_tLuOo6fXAhUQ1WMKHX_8CosQFggUMAA&usg=AOvVaw3bKHvO8BEtJDYQ3uMvsrSj
            '''
            href = href[href.find('q=') + 2:href.find('&')]
            if 'reuters' in href:
                reuters_url.append(href)
        return reuters_url
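The slicing `href[href.find('q=') + 2:href.find('&')]` assumes `q=` is the first query parameter of the redirect URL. A more defensive alternative (a sketch, not part of the original class) parses the query string explicitly:

from urllib.parse import urlparse, parse_qs

def extract_target(redirect_url):
    # Google result links look like http://www.google.com/url?q=<target>&sa=...
    # parse_qs decodes the query string instead of slicing on 'q=' and '&'.
    qs = parse_qs(urlparse(redirect_url).query)
    return qs.get('q', [''])[0]

# extract_target('http://www.google.com.hk/url?q=http://www.reuters.com/article/x&sa=U')
# -> 'http://www.reuters.com/article/x'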
import os
import sys
import pprint

sys.path.append(os.path.dirname(os.path.dirname(__file__)))
from MagicGoogle import MagicGoogle

################################################
# """
# cd MagicGoogle
# python Examples/search_result.py
# """
#################################################

PROXIES = [{'http': 'http://127.0.0.1:1080', 'https': 'http://127.0.0.1:1080'}]

# Or MagicGoogle()
mg = MagicGoogle(PROXIES)

# Parse a saved Google results page instead of issuing a live query.
google2amazon_results = open(
    './Connectors Adapters-11 2018-04-25 00;21;03.html', 'r')
google2amazon_result = google2amazon_results.read()
pq_content = mg.pq_html(google2amazon_result)
for item in pq_content('div.g').items():
    result = {'title': item('h3.r>a').eq(0).text()}
    href = item('h3.r>a').eq(0).attr('href')
    rating = item('div.f.slp').eq(0).text()
    if href:
        url = mg.filter_link(href)
        result['url'] = url
    text = item('span.st').text()
    result['text'] = text
def setUp(self):
    PROXIES = [{
        'http': '192.168.1.159:1080',
        'https': '192.168.1.159:1080'
    }]
    self.mg = MagicGoogle(PROXIES)
def Main(**kwargs):
    '''
    :param kwargs:
        start_date:
        end_date:
        keywords_file:
        output_dir:
    :return:
    '''
    start_date = kwargs['start_date'] if 'start_date' in kwargs else '2017-01-01'
    end_date = kwargs['end_date'] if 'end_date' in kwargs else '2017-02-01'
    start_date = datetime.strptime(start_date, "%Y-%m-%d")
    end_date = datetime.strptime(end_date, "%Y-%m-%d")
    InitLogging()
    keys_file = kwargs['keywords_file']
    key_list = GetKeyList(keys_file)
    assert len(key_list) > 0
    output_dir = kwargs['output_dir']
    mg = MagicGoogle(PROXIES)
    global logger
    searcher = GoogleSearch(logger)
    while start_date < end_date:
        """
        for key in key_list:
            q = 'www.reuters.com/article/{} {}'.format(
                start_date.strftime("%Y/%m/%d"), key)
            print(q)
            logger.info('info:date={}||key_word=\'{}\'\n'.format(
                start_date.strftime("%Y-%m-%d"), key.lower()))
            reuters_url = searcher.search(query=q)
            print(reuters_url)
            for url in reuters_url:
                if start_date.strftime('%Y%m%d') not in url:
                    continue
                print(url)
                DownloadFromReuters(
                    os.path.join(output_dir, start_date.strftime('%Y_%m_%d')),
                    url)
                time.sleep(random.randint(60, 120))
        """
        # Crawl the Reuters daily archive page directly instead of searching.
        url = 'https://www.reuters.com/resources/archive/us/{}.html'.format(
            start_date.strftime("%Y%m%d"))
        res = requests.get(url=url, proxies=PROXIES[0])
        soup = BeautifulSoup(res.text, 'html.parser')
        ref_list = [
            h.a['href']
            for h in soup.find_all("div", {'class': 'headlineMed'})
        ]
        for ref in ref_list:
            if 'videoStory' in ref:  # filter video news
                continue
            pprint.pprint(ref)
            try:
                DownloadFromReuters(
                    os.path.join(output_dir, start_date.strftime("%Y_%m_%d")),
                    url=ref)
            except requests.exceptions.ProxyError:
                logger.error("ProxyError||url={}".format(ref))
            # time.sleep(random.randint(1, 2))
        time.sleep(random.randint(20, 120))
        start_date += timedelta(days=1)
# -*- coding:utf-8 -*-
from urllib import quote_plus

import cchardet
import requests
from MagicGoogle import MagicGoogle
import sys

reload(sys)
sys.setdefaultencoding("utf-8")

PROXIES = [{
    'http': 'http://152.204.130.86:3128',
    'https': 'https://152.204.130.86:3128',
}]
mg = MagicGoogle(PROXIES)
postfixStr = ' "Currently unavailable." site:www.amazon.co.uk'


def getfrombing():
    keywords = open('./keywords', 'r')
    for keyword in keywords:
        keyword = keyword.rstrip()
        word = keyword + postfixStr
        url = ('http://global.bing.com/search?q=' + quote_plus(word) +
               '&qs=bs&ajf=60&first=1')
        flag0 = 3
        for k in range(0, 100):
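`quote_plus` encodes spaces as `+` and reserved characters as percent escapes, which is what the Bing URL above needs; a quick illustration (Python 2 import to match the snippet):

from urllib import quote_plus  # Python 3: from urllib.parse import quote_plus

word = 'lamp "Currently unavailable." site:www.amazon.co.uk'
print(quote_plus(word))
# lamp+%22Currently+unavailable.%22+site%3Awww.amazon.co.uk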
def get_google_result(query_text, lan='en', no=10):
    mg = MagicGoogle()
    search_results = mg.search(query_text, language=lan, num=no)
    text_list = [entity['text'] for entity in search_results]
    return text_list
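A usage sketch with an illustrative query:

snippets = get_google_result('python unittest tutorial', lan='en', no=5)
for text in snippets:
    print(text)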
def setUp(self):
    PROXIES = [{'http': '127.0.0.1:1080', 'https': '127.0.0.1:1080'}]
    self.mg = MagicGoogle(PROXIES)
import contextlib
import logging

from MagicGoogle import MagicGoogle

################################################
# """
# cd MagicGoogle
# python Examples/search_result.py
# """
#################################################

PROXIES = [{
    'http': 'http://152.204.130.86:3128',
    'https': 'https://152.204.130.86:3128',
}]

# Or MagicGoogle()
mg = MagicGoogle()

# Quiet the noisy third-party loggers.
logging.getLogger("urllib3").setLevel(logging.WARNING)
logging.getLogger("chardet").setLevel(logging.WARNING)
logging.getLogger("requests").setLevel(logging.WARNING)
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(name)s - %(levelname)s: %(message)s')
LOGGER = logging.getLogger('google_search.py')


# Context manager: the connection is closed automatically after use.
@contextlib.contextmanager
def mysql(host='127.0.0.1', port=3306, user='******',
#!/usr/bin/env python3
from MagicGoogle import MagicGoogle
import pprint

mg = MagicGoogle()

# Crawling the whole page
# results = mg.search_page(query='readmorejoy')
# for result in results:
#     print(result)
#     pprint.pprint(result)

morejoys = mg.search(query='readmorejoy')
for result in morejoys:
    print(result)

# Crawling url
# for url in mg.search_url(query='python'):
#     pprint.pprint(url)
# To get this working, first run: pip install MagicGoogle
# Google may answer with 403 errors; in that case use getgoogle2.py instead.
import os
import sys
import time
import random
import pprint
from MagicGoogle import MagicGoogle

mg = MagicGoogle()
for i in range(1, 5):
    for url in mg.search_url(query='"index of / - OneIndex"',
                             start=i * 100 + 1, num=100):
        pprint.pprint(url)
    time.sleep(random.randint(5, 10))
def get_detail():
    if request.method == 'POST':
        try:
            tag = request.form['srch-term']
            tt = tag
            mg = MagicGoogle()
            lijst = []
            search = str(tt + ' language:english file:html')
            for url in mg.search_url(query=search):
                lijst.append(url)
            df = pd.DataFrame({'url': lijst})
            dff = applyParallel(df.groupby(df.index), tmpFunc)
            dff = dff.query('words != "err" & latency != "err"')
            documents = dff['text'].tolist()
            dftekst = dff[['text', 'url']]
            dftekst = dftekst.replace('\n', ' ', regex=True)
            ddurl = dff['url'].tolist()
            # Cluster the page texts into four groups.
            vectorizer = TfidfVectorizer(stop_words='english')
            X = vectorizer.fit_transform(documents)
            true_k = 4
            model = KMeans(n_clusters=true_k, init='k-means++',
                           max_iter=100, n_init=1)
            model.fit(X)
            # Describe each cluster by its ten highest-weight terms.
            order_centroids = model.cluster_centers_.argsort()[:, ::-1]
            terms = vectorizer.get_feature_names()
            jk = []
            for i in range(true_k):
                top_ten_words = [terms[ind] for ind in order_centroids[i, :10]]
                jk.append(' '.join(top_ten_words))
            columns = ['clus', 'category']
            dfd = pd.DataFrame(jk)
            dfd.insert(0, 'clus', range(0, 0 + len(dfd)))
            dfd.columns = columns
            se = pd.Series(model.labels_)
            dff = dff[[
                'readable', 'reputationHigh', 'reputationLow', 'pictures',
                'polarity', 'latency', 'subjectivity', 'words', 'maturity',
                'url'
            ]]
            newX = dff.values
            newX = np.delete(newX, [9], axis=1)
            result = pickle_model.predict(newX)
            px2 = result.reshape((-1, 8))
            dffres = pd.DataFrame({
                'complete': px2[:, 0],
                'accuracy': px2[:, 1],
                'precise': px2[:, 2],
                'readable': px2[:, 3],
                'relevant': px2[:, 4],
                'trustworthy': px2[:, 5],
                'overall': px2[:, 6],
                'neutral': px2[:, 7],
                'url': ddurl,
                'clus': se.values
            })
            # Join the quality scores with the cluster descriptions.
            dfs = [dff, dffres]
            dfk = functools.reduce(
                lambda left, right: pd.merge(left, right, on='url'), dfs)
            geheel = dfd.merge(dfk, on='clus', how='inner')
            return render_template('adet.html',
                                   dataframe=geheel.to_html(index=False))
        except Exception:
            flash('We failed to find a matching URL. Please try another query.',
                  'nameError')
            return redirect('/')
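The cluster-labelling trick used above (argsort the KMeans cluster centers, then map the top indices back through the vectorizer vocabulary) is easier to see in isolation; a self-contained sketch with toy documents:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

docs = ['cats purr and sleep', 'dogs bark and fetch',
        'cats chase mice', 'dogs chase cats']
vec = TfidfVectorizer(stop_words='english')
X = vec.fit_transform(docs)
km = KMeans(n_clusters=2, init='k-means++', n_init=1, max_iter=100).fit(X)

terms = vec.get_feature_names()  # get_feature_names_out() on newer scikit-learn
order = km.cluster_centers_.argsort()[:, ::-1]  # highest-weight terms first
for i in range(2):
    print('Cluster %d:' % i, ' '.join(terms[ind] for ind in order[i, :3]))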
import time
import random
import pprint

from MagicGoogle import MagicGoogle

################################################
# """
# cd MagicGoogle
# python Examples/search_result.py
# """
#################################################

PROXIES = [{
    'http': 'http://192.168.2.207:1080',
    'https': 'http://192.168.2.207:1080'
}]

# Or MagicGoogle()
mg = MagicGoogle(PROXIES)

# The first page of results
result = mg.search_page(query='python')
print(result)
time.sleep(random.randint(1, 3))

# Get url
for url in mg.search_url(query='python'):
    pprint.pprint(url)
    time.sleep(random.randint(1, 3))

# Output
# 'https://www.python.org/'
def get(self, Query):
    '''Fetch a multi given its identifier'''
    mg = MagicGoogle()
    urls = []
    search = str(Query + ' language:english file:html')
    print(search)
    for url in mg.search_url(query=search):
        urls.append(str(url))
    tel = len(urls)
    # Fetch all pages concurrently, one worker per URL.
    pool = ThreadPool(tel)
    result = pool.map(get_web_data, urls)
    df1 = pd.DataFrame(result)
    df1 = df1[df1['tekst'].notnull()]
    print(len(df1.index))
    tekst = df1.tekst.values.tolist()
    df1.drop(['tekst'], axis=1, inplace=True)
    aantal = len(tekst)
    n_samples = 5000
    n_features = 2000
    n_components = aantal
    n_top_words = 5
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                    max_features=n_features,
                                    stop_words='english')
    tf = tf_vectorizer.fit_transform(tekst)
    tf_feature_names = tf_vectorizer.get_feature_names()
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=30,
                                    learning_method='online',
                                    learning_offset=50., random_state=0)
    lda.fit(tf)
    ozzy = []

    def print_top_words(model, feature_names, n_top_words):
        # Collect the top words of each topic into the enclosing `ozzy` list.
        for topic_idx, topic in enumerate(model.components_):
            oz = " ".join(feature_names[i]
                          for i in topic.argsort()[:-n_top_words - 1:-1])
            ozzy.append(oz)

    print_top_words(lda, tf_feature_names, n_top_words)
    df1['topic'] = ozzy
    true_k = int(aantal * 0.3)
    km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
    km.fit(tf)
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    terms = tf_vectorizer.get_feature_names()
    jk = []
    for i in range(true_k):
        j = []
        jk.append(j)
        for ind in order_centroids[i, :7]:
            za = str(' %s' % terms[ind])
            j.append(za)
    cols = {'clusters': jk}
    df2 = pd.DataFrame.from_dict(cols)
    df2['clusters'] = df2['clusters'].astype(str).str.replace(r"[\[\]']", '')
    df2.insert(0, 'clusterid', range(0, 0 + len(df2)))
    labels = km.labels_
    df1['clusterid'] = labels
    dfs = pd.merge(df1, df2)
    dfs.drop(['clusterid'], axis=1, inplace=True)
    multiS = dfs.to_dict()
    return multiS
    api.abort(404)
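`print_top_words` collects, per LDA topic, the highest-weight vocabulary entries via the reversed `argsort` slice; the same idea as a self-contained sketch:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

docs = ['apples and oranges are fruit', 'dogs and cats are pets',
        'oranges are sweet fruit', 'cats chase dogs']
tf_vec = CountVectorizer(stop_words='english')
tf = tf_vec.fit_transform(docs)
lda = LatentDirichletAllocation(n_components=2, learning_method='online',
                                random_state=0).fit(tf)

names = tf_vec.get_feature_names()  # get_feature_names_out() on newer scikit-learn
n_top_words = 3
for idx, topic in enumerate(lda.components_):
    # topic.argsort()[:-n_top_words - 1:-1] walks the largest weights
    # in descending order.
    top = ' '.join(names[i] for i in topic.argsort()[:-n_top_words - 1:-1])
    print('Topic %d: %s' % (idx, top))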
import os
import sys
import time
import random
import pprint

sys.path.append(os.path.dirname(os.path.dirname(__file__)))
from MagicGoogle import MagicGoogle

################################################
# """
# cd MagicGoogle
# python Examples/google_search.py
# """
#################################################

# Local proxy server
PROXIES = [{'http': 'http://127.0.0.1:8118', 'https': 'http://127.0.0.1:8118'}]

# Or MagicGoogle()
mg = MagicGoogle(PROXIES)

# --------------------------- Get {'title','url','text'} Code --------------------------- #
for i in mg.search(query='python', num=1, language='en'):
    pprint.pprint(i)
    time.sleep(random.randint(1, 5))

# --------------------------- Get {'title','url','text'} Output ------------------------- #
# {'text': 'The official home of the Python Programming Language.',
#  'title': 'Welcome to Python .org',
#  'url': 'https://www.python.org/'}

# --------------------------- Get first page Code --------------------------------------- #
for url in mg.search_url(query='python'):
    pprint.pprint(url)
    time.sleep(random.randint(1, 5))