Code Example #1
def get_term():
    if request.method == 'POST':
        tag = request.form['srch-term']
        tt = tag

        mg = MagicGoogle()
        lijst = []
        #tt = 'Donald Trump'
        search = str(tt+' language:english file:html')
        for url in mg.search_url(query=search):
            lijst.append(url)

        df = pd.DataFrame({'url': lijst})
        #print('parallel versionOzzy: ')
        dff = ((applyParallel(df.groupby(df.index), tmpFunc)))
        dff = dff.query('wordcount != "err" & reponsetime != "err" & highwot != "err" & yeararchive != "err"')
        #dff = dff[dff.wordcount != 'err' ]
        #dfeat = dff
        # dfeat =del dff['url']
        newX = dff.values
        # newX=np.delete(newX, [1, 3], axis=1)
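        # Drop the non-numeric column (presumably the scraped URL) so only feature values reach the model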
        newX = np.delete(newX, [6], axis=1)
        # print(newX)
        #newX = newX[~np.isnan(newX).any(axis=1)]
        #newX = newX.as_matrix().astype(np.float)
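        # Load the previously trained model and predict the eight quality dimensions for each page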
        pickle_fname = 'pickle.model'
        pickle_model = pickle.load(open(pickle_fname, 'rb'))
        result = pickle_model.predict(newX)  # print (result)
        px2 = result.reshape((-1, 8))
        dffres = pd.DataFrame(
            {'OverallQuality': px2[:, 0], 'accuracy': px2[:, 1], 'completeness': px2[:, 2], 'neutrality': px2[:, 3],
             'relevance': px2[:, 4], 'trustworthiness': px2[:, 5], 'readability': px2[:, 6], 'precision': px2[:, 7]})

    return render_template('mp.html', dataframe=dff.to_html(index=False), res=dffres.to_html(index=False))
Code Example #2
def conasearch(engine='google', key='python', page_num=0, page_size=30):
    result = []
    if engine == 'google':
        PROXIES = [{
            'http': '192.168.1.159:1080',
            'https': '192.168.1.159:1080'
        }]
        mg = MagicGoogle(PROXIES)
        sleep = random.randint(2, 15)

        for i in mg.search_url(query=key,
                               num=page_size,
                               pause=sleep,
                               start=page_num * page_size):
            # print '[URL]', i
            result.append(i)
    elif engine == 'baidu':
        global proxies
        mb = MagicBaidu(proxies)
        for i in mb.search(query=key,
                           pn=page_size * (page_num - 1),
                           rn=page_size):
            print '[URL]', i['url'], ': ', '[TITLE]', i['title']
            result.append(i['url'])
    else:
        print 'invalid engine :', engine
        exit(1)
    q.put(result)
Code Example #3
    def __init__(self, logger):
        service_args = [
            '--proxy=162.105.146.128:65434',
            '--proxy-type=http',
            '--proxy-auth=lcq:lcq123',
        ]
        self.browser = webdriver.PhantomJS(service_args=service_args)

        self.magic_google_tool = MagicGoogle()

        self.logger = logger
Code Example #4
def get_term():
    if request.method == 'POST':
        try:
            tag = request.form['srch-term']
            tt = tag

            mg = MagicGoogle()
            lijst = []
            #tt = 'Donald Trump'
            search = str(tt + ' language:english file:html')
            for url in mg.search_url(query=search):
                lijst.append(url)

            df = pd.DataFrame({'url': lijst})
            #print('parallel versionOzzy: ')
            dff = ((applyParallel(df.groupby(df.index), tmpFunc)))
            dff = dff.query('words != "err" & latency != "err"')
            #dff = dff.query ('words != "err" & latency != "err" & reputationHigh != "err" & maturity != "err"')
            #twit = dff[['kurtosispolarity', 'meansentiment', 'sdpolarity', 'tweetcount', 'tweetrate', 'url']]
            dff = dff[[
                'readable', 'reputationHigh', 'reputationLow', 'pictures',
                'polarity', 'latency', 'subjectivity', 'words', 'maturity',
                'url'
            ]]
            newX = dff.values
            # newX=np.delete(newX, [1, 3], axis=1)
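            # Column index 9 is 'url'; drop it so only the numeric features remain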
            newX = np.delete(newX, [9], axis=1)
            # print(newX)
            # newX = newX[~np.isnan(newX).any(axis=1)]
            # newX = newX.as_matrix().astype(np.float)
            # pickle_fname = 'pickle.model'
            # pickle_model = pickle.load (open (pickle_fname, 'rb'))
            result = pickle_model.predict(newX)  # print (result)
            px2 = result.reshape((-1, 8))
            dffres = pd.DataFrame({
                'complete': px2[:, 0],
                'accuracy': px2[:, 1],
                'precise': px2[:, 2],
                'readable': px2[:, 3],
                'relevant': px2[:, 4],
                'trustworthy': px2[:, 5],
                'overall': px2[:, 6],
                'neutral': px2[:, 7]
            })
            return render_template('mp.html',
                                   dataframe=dff.to_html(index=False),
                                   res=dffres.to_html(index=False))
        except:
            flash('We failed to find a matching URL. Please try another query.',
                  'nameError')
            return redirect('/')
Code Example #5
File: app.py Project: davideceolin/qupid
def get_term():
    if request.method == 'POST':
        tag = request.form['srch-term']
        tt = tag

        mg = MagicGoogle()
        lijst = []
        #tt = 'Donald Trump'
        search = str(tt + ' language:english file:html')
        for url in mg.search_url(query=search):
            lijst.append(url)

        df = pd.DataFrame({'url': lijst})
        dff = ((applyParallel(df.groupby(df.index), tmpFunc)))
        #dff = dff[dff.wordcount != 'err']
        newX = dff.values
        newX = np.delete(newX, [6], axis=1)
        pickle_fname = 'pickle.model'
        pickle_model = pickle.load(open(pickle_fname, 'rb'))
        try:
            result = pickle_model.predict(newX)
        except:
            result = np.array([0, 0, 0, 0, 0, 0, 0, 0])
        print(result)
        px2 = result.reshape((-1, 8))
        dffres = pd.DataFrame({
            'OverallQuality': px2[:, 0],
            'accuracy': px2[:, 1],
            'completeness': px2[:, 2],
            'neutrality': px2[:, 3],
            'relevance': px2[:, 4],
            'trustworthiness': px2[:, 5],
            'readability': px2[:, 6],
            'precision': px2[:, 7]
        })
        #print(dffres)
        print(dffres)
        for row in dffres:
            print(dffres[row])
        #dffres2 = {row:plotpie(dffres[row][7],dffres[row][1],dffres[row][2],dffres[row][3],dffres[row][4],dffres[row][6],dffres[row][5])  for row in dffres} #
        dffres2 = {'a': plotpie(1, 2, 3, 4, 5, 6, 7)}
        pd.set_option('display.max_colwidth', -1)
        #print(dffres3)
        #print(type(dffres2))
        #print(dffres3.dtypes)
        return render_template('mp.html',
                               dataframe=dff.to_html(index=False),
                               res=dffres2)
Code Example #6
def PDFUrlGeter(filename):
    '''
    Use Google to search for the file named by filename and obtain its URLs.
    :return: list of URLs
    '''
    try:
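        # Try progressively looser queries: 'filetype:pdf', then 'inurl:pdf', then any result whose URL contains 'pdf'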
        mg = MagicGoogle()
        urls = mg.search_url(query=filename + ' filetype:pdf')
        u = [i for i in urls]
        if u:
            return u
        time.sleep(random.randint(1, 5))
        urls = mg.search_url(query=filename + ' inurl:pdf')
        u = [i for i in urls]
        if u:
            return u
        time.sleep(random.randint(1, 5))
        urls = mg.search_url(query=filename)
        u = [i for i in urls if 'pdf' in i]
        return u
    except:
        errorlist.append(filename)
        return []
Code Example #7
class TestMagicGoogle(unittest.TestCase):
    """
    Test MagicGoogle class
    """
    def setUp(self):
        PROXIES = [{'http': '127.0.0.1:1080', 'https': '127.0.0.1:1080'}]
        self.mg = MagicGoogle(PROXIES)

    def tearDown(self):
        self.mg = None

    def test_search_url(self):
        sleep = random.randint(2, 15)
        result = list(self.mg.search_url(query='python', num=1, pause=sleep))
        self.assertEqual(result[0], 'https://www.python.org/',
                         'test search_url fail')
Code Example #8
class GoogleSearch:
    def __init__(self, logger):
        service_args = [
            '--proxy=162.105.146.128:65434',
            '--proxy-type=http',
            '--proxy-auth=lcq:lcq123',
        ]
        self.browser = webdriver.PhantomJS(service_args=service_args)

        self.magic_google_tool = MagicGoogle()

        self.logger = logger

    def __del__(self):
        # close the browser
        self.browser.close()

    def search(self, query):
        '''

        :param query: keywords filled in Google
        :return: url list of reuters
        '''
        url = self.magic_google_tool.get_url_from_query(query=query)
        url = url.replace('https', 'http')
        self.browser.get(url)
        self.logger.info(url)
        reuters_url = list()
        for element in self.browser.find_elements_by_class_name('r'):
            a = element.find_element_by_tag_name('a')
            href = a.get_attribute('href')
            self.logger.info(href)
            '''
            :href is like
            http://www.google.com.hk/url?
            q=http://www.reuters.com/article/us-htc-earnings-idUSBREA0403F20140105
            &sa=U&ved=0ahUKEwj_tLuOo6fXAhUQ1WMKHX_8CosQFggUMAA&usg=AOvVaw3bKHvO8BEtJDYQ3uMvsrSj
            '''
            href = href[href.find('q=') + 2 : href.find('&')]
            if 'reuters' in href:
                reuters_url.append(href)
        return reuters_url
Code Example #9
File: test_pyQuary.py Project: iotwlw/MagicGoogle
import os
import sys
import pprint

sys.path.append(os.path.dirname(os.path.dirname(__file__)))
from MagicGoogle import MagicGoogle

################################################
# """
# cd MagicGoogle
# python Examples/search_result.py
# """
#################################################

PROXIES = [{'http': 'http://127.0.0.1:1080', 'https': 'http://127.0.0.1:1080'}]

# Or MagicGoogle()
mg = MagicGoogle(PROXIES)

google2amazon_results = open(
    './Connectors Adapters-11 2018-04-25 00;21;03.html', 'r')
google2amazon_result = google2amazon_results.read()

pq_content = mg.pq_html(google2amazon_result)
for item in pq_content('div.g').items():
    result = {'title': item('h3.r>a').eq(0).text()}
    href = item('h3.r>a').eq(0).attr('href')
    rating = item('div.f.slp').eq(0).text()
    if href:
        url = mg.filter_link(href)
        result['url'] = url
    text = item('span.st').text()
    result['text'] = text
Code Example #10
    def setUp(self):
        PROXIES = [{
            'http': '192.168.1.159:1080',
            'https': '192.168.1.159:1080'
        }]
        self.mg = MagicGoogle(PROXIES)
Code Example #11
File: main.py Project: pkwv2012/crawl_reuters
def Main(**kwargs):
    '''
    :param kwargs:
        start_date:
        end_date:
        keywords_file:
        output_dir:
    :return:
    '''
    start_date = kwargs[
        'start_date'] if 'start_date' in kwargs else '2017-01-01'
    end_date = kwargs['end_date'] if 'end_date' in kwargs else '2017-02-01'
    start_date = datetime.strptime(start_date, "%Y-%m-%d")
    end_date = datetime.strptime(end_date, "%Y-%m-%d")
    InitLogging()

    keys_file = kwargs['keywords_file']
    key_list = GetKeyList(keys_file)
    assert len(key_list) > 0

    output_dir = kwargs['output_dir']
    mg = MagicGoogle(PROXIES)
    global logger
    searcher = GoogleSearch(logger)
    while start_date < end_date:
        """
        for key in key_list:
            q = 'www.reuters.com/article/{} {}'.format(
                start_date.strftime("%Y/%m/%d"),
                key)
            print(q)
            logger.info('info:date={}||key_word=\'{}\'\n'.format(
                start_date.strftime("%Y-%m-%d"),
                key.lower()))
            reuters_url = searcher.search(query=q)
            print(reuters_url)
            for url in reuters_url:
                if start_date.strftime('%Y%m%d') not in url:
                    continue;
                print(url)
                DownloadFromReuters(
                    os.path.join(output_dir, start_date.strftime('%Y_%m_%d')),
                    url)

            time.sleep(random.randint(60, 120))
        """
        url = 'https://www.reuters.com/resources/archive/us/{}.html'.format(
            start_date.strftime("%Y%m%d"))
        res = requests.get(url=url, proxies=PROXIES[0])
        soup = BeautifulSoup(res.text, 'html.parser')
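        # Collect every article link from the day's archive page; video stories are skipped below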
        ref_list = [
            h.a['href'] for h in soup.find_all("div", {'class': 'headlineMed'})
        ]
        for ref in ref_list:
            if 'videoStory' in ref:
                # filter video news
                continue
            pprint.pprint(ref)
            try:
                DownloadFromReuters(os.path.join(
                    output_dir, start_date.strftime("%Y_%m_%d")),
                                    url=ref)
            except requests.exceptions.ProxyError:
                logger.error("ProxyError||url={}".format(ref))
            # time.sleep(random.randint(1, 2))
        time.sleep(random.randint(20, 120))
        start_date += timedelta(days=1)
Code Example #12
File: bingsearch.py Project: iotwlw/MagicGoogle
# -*- coding:utf-8 -*-
from urllib import quote_plus

import cchardet
import requests
from MagicGoogle import MagicGoogle
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
PROXIES = [{
    'http': 'http://152.204.130.86:3128',
    'https': 'https://152.204.130.86:3128',
}]

mg = MagicGoogle(PROXIES)

postfixStr = ' "Currently unavailable." site:www.amazon.co.uk'


def getfrombing():
    keywords = open('./keywords', 'r')
    for keyword in keywords:
        keyword = keyword.rstrip()

        word = keyword + postfixStr
        url = 'http://global.bing.com/search?q=' + quote_plus(
            word) + '&qs=bs&ajf=60&first=1'

        flag0 = 3

        for k in range(0, 100):
Code Example #13
    def get_google_result(query_text, lan='en', no=10):
        mg = MagicGoogle()
        search_results = mg.search(query_text, language=lan, num=no,)

        text_list = [entity['text'] for entity in search_results]
        return text_list
Code Example #14
    def setUp(self):
        PROXIES = [{'http': '127.0.0.1:1080', 'https': '127.0.0.1:1080'}]
        self.mg = MagicGoogle(PROXIES)
Code Example #15
File: google_search.py Project: iotwlw/MagicGoogle
import contextlib
import logging

from MagicGoogle import MagicGoogle

################################################
# """
# cd MagicGoogle
# python Examples/search_result.py
# """
#################################################

PROXIES = [{
    'http': 'http://152.204.130.86:3128',
    'https': 'https://152.204.130.86:3128',
}]

# Or MagicGoogle()
mg = MagicGoogle()

logging.getLogger("urllib3").setLevel(logging.WARNING)
logging.getLogger("chardet").setLevel(logging.WARNING)
logging.getLogger("requests").setLevel(logging.WARNING)
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(name)s - %(levelname)s: %(message)s')
LOGGER = logging.getLogger('google_search.py')


# Define a context manager that automatically closes the connection when it is no longer needed
@contextlib.contextmanager
def mysql(host='127.0.0.1',
          port=3306,
          user='******',
Code Example #16
File: google.py Project: mytry/enjoysearch
#!/usr/bin/env python3

from MagicGoogle import MagicGoogle
import pprint

mg = MagicGoogle()

#  Crawling the whole page
# results = mg.search_page(query='readmorejoy')
# for result in results:
#     print(result)

#pprint.pprint(result)

morejoys = mg.search(query='readmorejoy')
for result in morejoys:
    print(result)

# Crawling url
# for url in mg.search_url(query='python'):
#     pprint.pprint(url)
Code Example #17
# To get this working smoothly, first run: pip install MagicGoogle
# Because a 403 error can occur, please use getgoogle2.py instead.

import os
import sys
import time
import random
import pprint
from MagicGoogle import MagicGoogle

mg = MagicGoogle()

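# Page through the results 100 at a time, pausing between requests to reduce the chance of being blocked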
for i in range(1, 5):
    for url in mg.search_url(query='"index of / - OneIndex"', start=i*100+1, num=100):
        pprint.pprint(url)
    time.sleep(random.randint(5, 10))


Code Example #18
def get_detail():
    if request.method == 'POST':
        try:
            tag = request.form['srch-term']
            tt = tag

            mg = MagicGoogle()
            lijst = []
            #tt = 'Donald Trump'
            search = str(tt + ' language:english file:html')
            for url in mg.search_url(query=search):
                lijst.append(url)

            df = pd.DataFrame({'url': lijst})
            #print('parallel versionOzzy: ')
            dff = ((applyParallel(df.groupby(df.index), tmpFunc)))
            dff = dff.query('words != "err" & latency != "err"')
            #dff.to_csv('tect.csv')
            documents = dff['text'].tolist()
            #documents.to_csv('t')
            dftekst = dff[['text', 'url']]
            #dftekst.to_csv('ozzy.csv')
            dftekst = dftekst.replace('\n', ' ', regex=True)
            #print(d)
            ddurl = dff['url'].tolist()
            #dfurl = dff[['url']].copy()
            #documents = dff['ur'].tolist ()
            #print(documents)
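            # Vectorise the page texts with TF-IDF and cluster them into four groups with k-means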
            vectorizer = TfidfVectorizer(stop_words='english')
            X = vectorizer.fit_transform(documents)

            true_k = 4
            model = KMeans(n_clusters=true_k,
                           init='k-means++',
                           max_iter=100,
                           n_init=1)
            model.fit(X)

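            # Label each cluster with its ten highest-weighted terms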
            order_centroids = model.cluster_centers_.argsort()[:, ::-1]
            terms = vectorizer.get_feature_names()
            jk = []
            for i in range(true_k):
                top_ten_words = [terms[ind] for ind in order_centroids[i, :10]]
                j = (' '.join(top_ten_words))
                #print(j)
                jk.append(j)
                #print ("Cluster {}: {}".format (i, ' '.join (top_ten_words)))

            columns = ['clus', 'category']
            dfd = pd.DataFrame(jk)

            dfd.insert(0, 'clus', range(0, 0 + len(dfd)))
            dfd.columns = columns
            #print(dfd)
            se = pd.Series(model.labels_)
            dff = dff[[
                'readable', 'reputationHigh', 'reputationLow', 'pictures',
                'polarity', 'latency', 'subjectivity', 'words', 'maturity',
                'url'
            ]]
            newX = dff.values
            newX = np.delete(newX, [9], axis=1)
            result = pickle_model.predict(newX)
            #print('res')
            px2 = result.reshape((-1, 8))
            dffres = pd.DataFrame({
                'complete': px2[:, 0],
                'accuracy': px2[:, 1],
                'precise': px2[:, 2],
                'readable': px2[:, 3],
                'relevant': px2[:, 4],
                'trustworthy': px2[:, 5],
                'overall': px2[:, 6],
                'neutral': px2[:, 7],
                'url': ddurl,
                'clus': se.values
            })

            #dfs = [dff, dfda, dffres]
            dfs = [dff, dffres]
            dfk = functools.reduce(
                lambda left, right: pd.merge(left, right, on='url'), dfs)
            geheel = dfd.merge(dfk, on='clus', how='inner')
            #geheel.to_csv('ozkan.csv', index=False)
            #print('ozzy')(classes=["table-bordered", "table-striped", "table-hover"])
            return render_template('adet.html',
                                   dataframe=geheel.to_html(index=False))
            #return render_template ('adet.html', dataframe=dff.to_html (index=False, classes=["table table-sm"]))
        except:
            #print(e)
            flash('We failed to find a matching URL. Please try another query.',
                  'nameError')
            return redirect('/')
Code Example #19
import pprint
import random
import time

from MagicGoogle import MagicGoogle

################################################
# """
# cd MagicGoogle
# python Examples/search_result.py
# """
#################################################

PROXIES = [{
    'http': 'http://192.168.2.207:1080',
    'https': 'http://192.168.2.207:1080'
}]

# Or MagicGoogle()
mg = MagicGoogle(PROXIES)

# The first page of results
result = mg.search_page(query='python')
print(result)

time.sleep(random.randint(1, 3))

# Get url
for url in mg.search_url(query='python'):
    pprint.pprint(url)

time.sleep(random.randint(1, 3))

# Output
# 'https://www.python.org/'
Code Example #20
File: multi.py Project: Qupid/qupid
    def get(self, Query):
        mg = MagicGoogle()
        urls = []
        search = str(Query + ' language:english file:html')
        print(search)
        for url in mg.search_url(query=search):
            urls.append(str(url))
        tel = len(urls)
        pool = ThreadPool(tel)
        result = pool.map(get_web_data, urls)
        df1 = pd.DataFrame(result)
        df1 = df1[df1['tekst'].notnull()]
        print(len(df1.index))
        tekst = df1.tekst.values.tolist()
        df1.drop(['tekst'], axis=1, inplace=True)
        #cat = df1['label'].values
        aantal = len(tekst)
        #print(aantal)
        #print(aantal)

        n_samples = 5000
        n_features = 2000
        n_components = aantal
        n_top_words = 5

        tf_vectorizer = CountVectorizer(max_df=0.95,
                                        min_df=2,
                                        max_features=n_features,
                                        stop_words='english')
        tf = tf_vectorizer.fit_transform(tekst)
        tf_feature_names = tf_vectorizer.get_feature_names()
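        # Fit an online LDA topic model (one component per crawled page) to derive a topic label for each page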
        lda = LatentDirichletAllocation(n_components=n_components,
                                        max_iter=30,
                                        learning_method='online',
                                        learning_offset=50.,
                                        random_state=0)

        lda.fit(tf)
        ozzy = []

        def print_top_words(model, feature_names, n_top_words):
            # ozzy = []
            for topic_idx, topic in enumerate(model.components_):
                oz = (" ".join([
                    feature_names[i]
                    for i in topic.argsort()[:-n_top_words - 1:-1]
                ]))
                ozzy.append(oz)

        print_top_words(lda, tf_feature_names, n_top_words)
        df1['topic'] = ozzy

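        # Cluster the pages with k-means, using roughly 30% of the page count as the number of clusters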
        true_k = int(aantal * 0.3)
        km = KMeans(n_clusters=true_k,
                    init='k-means++',
                    max_iter=100,
                    n_init=1)
        km.fit(tf)
        order_centroids = km.cluster_centers_.argsort()[:, ::-1]
        terms = tf_vectorizer.get_feature_names()
        jk = []
        for i in range(true_k):
            j = []
            jk.append(j)
            for ind in order_centroids[i, :7]:
                za = str(' %s' % terms[ind])
                j.append(za)

        cols = {'clusters': jk}
        df2 = pd.DataFrame.from_dict(cols)
        df2['clusters'] = df2['clusters'].astype(str).str.replace(
            r"[\[\]']", '')  # terms
        df2.insert(0, 'clusterid', range(0, 0 + len(df2)))

        labels = km.labels_
        df1['clusterid'] = labels
        dfs = pd.merge(df1, df2)
        dfs.drop(['clusterid'], axis=1, inplace=True)
        multiS = dfs.to_dict()
        return multiS
        # from sklearn.naive_bayes import MultinomialNB
        # # clf = MultinomialNB().fit(tf, cat)
        # from sklearn.externals import joblib
        # # joblib.dump(clf, 'filename.pkl')
        # clf = joblib.load('filename.pkl')
        # # ttf = tf
        # cats = clf.predict(tf)
        # # # acc=np.mean(predicted == cat)
        # cm = metrics.confusion_matrix(cat, predicted)
        # sim = cosine_similarity(tf)
        '''Fetch a multi given its Queryentifier'''
        api.abort(404)
Code Example #21
import os
import sys
import time
import random
import pprint

sys.path.append(os.path.dirname(os.path.dirname(__file__)))
from MagicGoogle import MagicGoogle

################################################
# """
# cd MagicGoogle
# python Examples/google_search.py
# """
#################################################

# Local proxy server
PROXIES = [{'http': 'http://127.0.0.1:8118', 'https': 'http://127.0.0.1:8118'}]

# Or MagicGoogle()
mg = MagicGoogle(PROXIES)

# --------------------------------- Get {'title','url','text'} Code  --------------------------------------------  ##
for i in mg.search(query='python', num=1, language='en'):
    pprint.pprint(i)
time.sleep(random.randint(1, 5))
# --------------------------------- Get {'title','url','text'} Output  ------------------------------------------  ##
# Output
# {'text': 'The official home of the Python Programming Language.',
# 'title': 'Welcome to Python .org',
# 'url': 'https://www.python.org/'}

# --------------------------------- Get first page Code ---------------------------------------------------------  ##
for url in mg.search_url(query='python'):
    pprint.pprint(url)
time.sleep(random.randint(1, 5))