Example #1
import logging

from gevent import joinall, spawn

logger = logging.getLogger(__name__)


def main(spider_class, settings_module):
    crawler = Crawler(spider_class, settings_module)
    try:
        # Run five concurrent greenlets that each consume URLs from the crawler.
        joinall([spawn(crawler.process_url) for i in xrange(5)])
    except KeyboardInterrupt:
        crawler.clear(False)
    except Exception:
        logger.exception("Unable to complete")
    else:
        crawler.clear(True)
        logger.info("Crawling completed")
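The gevent fan-out here does not depend on the Crawler class; the same pattern can be sketched with a stand-alone worker (the fetch function and URL list below are illustrative assumptions, Python 3):

import gevent
from gevent import monkey

monkey.patch_all()  # make blocking stdlib I/O cooperative before importing it

import urllib.request


def fetch(url):
    # Illustrative worker: each greenlet downloads one URL and reports its status.
    with urllib.request.urlopen(url, timeout=10) as resp:
        return resp.status


urls = ["http://example.com"] * 5
jobs = [gevent.spawn(fetch, u) for u in urls]
gevent.joinall(jobs, timeout=30)
print([job.value for job in jobs])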
Example #2
def test_whole():
    extractor = OpinionExtractor('word2vec/clean.partial.6pos.w3.m10.model')
    clusterer = Clusterer('word2vec/clean.partial.6pos.w3.m10.model')
    parser = MorphAnalyzer()
    crawler = Crawler(100)

    def extract_property(reviews):
        props = []

        for r in reviews:
            props += extractor.find_props(r)

        props = list(set(props))
        aprop, visual = clusterer.cluster_props(props)
        cdict = clusterer.cluster_as_dict(aprop, visual)
        centroids = clusterer.get_centroid(cdict)

        t = []

        for x in centroids:
            t.append((centroids[x],
                      extractor.score_cluster_rev_2(
                          cdict[x], centroids[x], reviews), x))

        ret = []

        for x, y, z in sorted(t, key=lambda item: item[1], reverse=True):
            ret.append((x, y, cdict[z][:5]))

        return ret

    res = extract_property(
            parser.parse_reviews(crawler.get_item_reviews(5749367460)))

    assert len(res) > 8
    assert res[0][1] > 23.0
Example #3
import random
import time

from crawl import Crawler

time.sleep(5)

#db = MySqlOperator(server='127.0.0.1', user_name='root', password='', dbname='taobao_sf')
city = '三门峡'
for cate_info in cate_list:
    cate_id = cate_info['id']
    cate_name = cate_info['name']
    print(cate_name)
    page = 1
    while True:
        url = 'https://h5api.m.taobao.com/h5/mtop.taobao.govauctionmtopcommonservice.getfrontcategory/1.0/?jsv=2.4.5&appKey=12574478&t=1570096614606&api=mtop.taobao.govauctionmtopcommonservice.getfrontcategory'
        headers = {
            'user-agent':
            'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Mobile Safari/537.36'
        }
        crawler = Crawler()
        proxy_ip = random.choice(PROXY_LIST)
        proxies = {
            "https": 'https://' + proxy_ip,
            "http": 'http://' + proxy_ip
        }
        res, session = crawler.crawl(url=url, headers=headers, proxies=proxies)
        cookies = res.cookies.get_dict()
        m_h5_tk = cookies['_m_h5_tk']
        app_key = '12574478'
        data = '{"city":"%s","pageNo":%s,"pageSize":99,"orderId":"0","categoryId":"%s"}' % (
            city, str(page), cate_id)
        sign, t = get_sign(m_h5_tk, app_key, data)
        params = {
            'jsv': '2.4.5',
            'appKey': app_key,
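The get_sign helper used above is not shown. The h5api.m.taobao.com mtop endpoints are commonly signed with an MD5 over the token from the _m_h5_tk cookie, a millisecond timestamp, the app key and the payload; a hedged sketch along those lines (the exact scheme is an assumption, not confirmed by this example) is:

import hashlib
import time


def get_sign(m_h5_tk, app_key, data):
    # Assumption: md5("<token>&<timestamp_ms>&<app_key>&<data>"), where the
    # token is the part of the _m_h5_tk cookie before the first underscore.
    token = m_h5_tk.split('_')[0]
    t = str(int(time.time() * 1000))
    raw = '&'.join([token, t, app_key, data])
    sign = hashlib.md5(raw.encode('utf-8')).hexdigest()
    return sign, t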
Example #4
from dynamodb import DB
import json
from crawl import Crawler
import threading

#Test Parameters
with open('params.json', 'r') as file:
    json_sns = file.read()

# DB().fill_empty_restaurant(json.loads(json_sns))
# Pass a callable as the thread target; calling it inline would run the upload
# on the main thread and hand Thread its return value instead of a function.
t = threading.Thread(
    name="Crawler",
    target=lambda: DB().upload_food_list(
        Crawler(json.loads(json_sns)).query(10)))
t.start()
t.join()
print("NEXT")
Example #5
        if self.filename:
            # Write the accumulated markup to the requested file, otherwise print it.
            with open(self.filename, 'w') as f:
                f.write(self.output_string)
        else:
            print(self.output_string)

    def print_tree(self, tree, level):
        self.output('<li><a href="' + tree.url + '">' + tree.url + '</a></li>',
                    level)
        if tree.statics:
            self.output('<b>Static resources:</b>', level)
            self.output('<ul>', level)
            for s in tree.statics:
                self.output('<li>' + s[0] + ': <a href="' + s[1] +
                            '">' + s[1] + '</a></li>', level)
            self.output('</ul>', level)
        if tree.children:
            self.output('<b>Children:</b>', level)
            self.output('<ul>', level)
            for c in tree.children:
                self.print_tree(c, level + 1)
            self.output('</ul>', level)

starttime = time.time()
crawler = Crawler(args.domain)
c = crawler.crawl_domain()
endtime = time.time()
p = Parser(c, args.file)
p.render_html(endtime - starttime)
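The print_tree fragment assumes an output(text, level) helper that accumulates markup into self.output_string before the file-writing block at the top of this example runs; one plausible sketch of that method (an assumption about this Parser class, not its actual code) is:

def output(self, text, level):
    # Assumed helper: indent by nesting level and accumulate the markup so the
    # finished self.output_string can be written to a file or printed.
    self.output_string += '    ' * level + text + '\n'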
Example #6
def main():
    crawler = Crawler("http://start.bg/")
    database = Database()
    crawler.start()
Example #7
single_row = RowParser(rows[0])
single_row.extract_fields()
print(single_row.extracted_content)

print("--------Wyciąganie danych ze wszystkich rezultatów--------------")

results = []
for row in rows:
    single_row = RowParser(row)
    single_row.extract_fields()
    results.append(single_row.extracted_content)

print(results)
print(f"length of results: {len(results)}")

print("test nowej klasy")
parser_ = Parser(a.page_content)
parser_.extract_fields()
print(parser_.results)

print(parser_._log)

print("test master obiektu ")

crawler = Crawler(search_params)
crawler.get_all()

print(crawler.results)
print(crawler.log)
Example #8
 def __init__(self):
     self.redis = RedisClient()
     self.crawler = Crawler()
Example #9
 async def invalid_url(self):
     url = 'http://yoyowalletxxxx.com'
     crawler = Crawler('')
     result = await crawler.get_body(url)
     self.assertEqual(result, '')
Example #10
 async def valid_url(self):
     url = 'http://yoyowallet.com'
     crawler = Crawler('')
     result = await crawler.get_body(url)
     self.assertTrue(result)
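Examples #9 and #10 read like methods cut out of an async test case; on Python 3.8+ they could live in unittest.IsolatedAsyncioTestCase, which runs the coroutines for you. A minimal sketch, assuming the Crawler import path used by the other examples:

import unittest

from crawl import Crawler  # assumed import path


class CrawlerBodyTests(unittest.IsolatedAsyncioTestCase):
    async def test_invalid_url(self):
        # A host that does not resolve should yield an empty body rather than raise.
        result = await Crawler('').get_body('http://yoyowalletxxxx.com')
        self.assertEqual(result, '')

    async def test_valid_url(self):
        result = await Crawler('').get_body('http://yoyowallet.com')
        self.assertTrue(result)


if __name__ == '__main__':
    unittest.main()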
Example #11
from crawl import Crawler
import json
from common import get_regex
from tools.mysql_operator import MySqlOperator

url = 'https://sf.taobao.com/item_list.htm?city=&province=%D5%E3%BD%AD'
crawler = Crawler()
res, session = crawler.crawl(url=url, encoding='gbk')
raw_data = get_regex(
    r'<script id="sf-item-list-data" type="text/json">([\S\s]*?)</script>',
    res.text, 1)
jdata = json.loads(raw_data)

data_list = list()
for item in jdata['data']:
    item_info = {'id': item['id'], 'title': item['title']}
    data_list.append(item_info)

db = MySqlOperator(server='127.0.0.1',
                   user_name='root',
                   password='',
                   dbname='taobao_sf')
db.bulk_insert('test_tb', data_list)
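The get_regex helper is not shown in Example #11; a plausible sketch, assuming it simply returns the requested capture group of the first match (or None when nothing matches), is:

import re


def get_regex(pattern, text, group=0):
    # Assumed helper: return one capture group from the first match, or None.
    match = re.search(pattern, text)
    return match.group(group) if match else None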