def main(spider_class, settings_module):
    crawler = Crawler(spider_class, settings_module)
    try:
        # Run five concurrent workers; spawn/joinall presumably come from gevent.
        joinall([spawn(crawler.process_url) for i in range(5)])
    except KeyboardInterrupt:
        crawler.clear(False)
    except Exception:
        logger.exception("Unable to complete")
    else:
        crawler.clear(True)
        logger.info("Crawling completed")
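# A minimal, self-contained illustration of the spawn/joinall pattern used in
# main() above, assuming the functions come from gevent (an assumption; the
# snippet does not show its imports). demo_worker is a placeholder name.
from gevent import joinall, spawn

def demo_worker(n):
    print('worker %d finished' % n)

joinall([spawn(demo_worker, i) for i in range(5)])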
def test_whole():
    extractor = OpinionExtractor('word2vec/clean.partial.6pos.w3.m10.model')
    clusterer = Clusterer('word2vec/clean.partial.6pos.w3.m10.model')
    parser = MorphAnalyzer()
    crawler = Crawler(100)

    def extract_property(reviews):
        props = []
        for r in reviews:
            props += extractor.find_props(r)
        props = list(set(props))
        aprop, visual = clusterer.cluster_props(props)
        cdict = clusterer.cluster_as_dict(aprop, visual)
        centroids = clusterer.get_centroid(cdict)
        t = []
        for x in centroids.keys():
            t.append((centroids[x],
                      extractor.score_cluster_rev_2(cdict[x], centroids[x], reviews),
                      x))
        ret = []
        for x, y, z in sorted(t, key=lambda e: e[1], reverse=True):
            ret.append((x, y, cdict[z][:5]))
        return ret

    res = extract_property(
        parser.parse_reviews(crawler.get_item_reviews(5749367460)))
    assert len(res) > 8
    assert res[0][1] > 23.0
time.sleep(5)
# db = MySqlOperator(server='127.0.0.1', user_name='root', password='', dbname='taobao_sf')
city = '三门峡'
for cate_info in cate_list:
    cate_id = cate_info['id']
    cate_name = cate_info['name']
    print(cate_name)
    page = 1
    while True:
        url = 'https://h5api.m.taobao.com/h5/mtop.taobao.govauctionmtopcommonservice.getfrontcategory/1.0/?jsv=2.4.5&appKey=12574478&t=1570096614606&api=mtop.taobao.govauctionmtopcommonservice.getfrontcategory'
        headers = {
            'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Mobile Safari/537.36'
        }
        crawler = Crawler()
        proxy_ip = random.choice(PROXY_LIST)
        proxies = {
            "https": 'https://' + proxy_ip,
            "http": 'http://' + proxy_ip
        }
        res, session = crawler.crawl(url=url, headers=headers, proxies=proxies)
        cookies = res.cookies.get_dict()
        m_h5_tk = cookies['_m_h5_tk']
        app_key = '12574478'
        data = '{"city":"%s","pageNo":%s,"pageSize":99,"orderId":"0","categoryId":"%s"}' % (
            city, str(page), cate_id)
        sign, t = get_sign(m_h5_tk, app_key, data)
        params = {
            'jsv': '2.4.5',
            'appKey': app_key,
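# A plausible sketch of the get_sign helper called above, assuming the widely
# used mtop h5 signing scheme: the token is the part of the _m_h5_tk cookie
# before the first underscore, and the sign is the md5 hex digest of
# "token&timestamp&appKey&data". This is an assumption; the snippet does not
# show get_sign's implementation.
import hashlib
import time

def get_sign(m_h5_tk, app_key, data):
    token = m_h5_tk.split('_')[0]
    t = str(int(time.time() * 1000))
    raw = '&'.join([token, t, app_key, data])
    return hashlib.md5(raw.encode('utf-8')).hexdigest(), t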
from dynamodb import DB
import json
from crawl import Crawler
import threading

# Test parameters
with open('params.json', 'r') as file:
    json_sns = file.read()

# DB().fill_empty_restaurant(json.loads(json_sns))

# Thread expects a callable in target; pass the uploader and its payload
# separately so the upload actually runs in the worker thread.
t = threading.Thread(
    name="Crawler",
    target=DB().upload_food_list,
    args=(Crawler(json.loads(json_sns)).query(10),))
t.start()
t.join()
print("NEXT")
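# Design note: with target/args, Crawler(...).query(10) still runs on the main
# thread before the worker starts. A lambda defers the whole pipeline into the
# worker thread instead (a sketch of the alternative):
#
#   t = threading.Thread(
#       name="Crawler",
#       target=lambda: DB().upload_food_list(Crawler(json.loads(json_sns)).query(10)))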
        if self.filename:
            with open(self.filename, 'w') as f:
                f.write(self.output_string)
        else:
            print(self.output_string)

    def print_tree(self, tree, level):
        self.output('<li><a href="' + tree.url + '">' + tree.url + '</a></li>', level)
        if tree.statics:
            self.output('<b>Static resources:</b>', level)
            self.output('<ul>', level)
            for s in tree.statics:
                self.output('<li>' + s[0] + ': <a href="' + s[1] + '">' + s[1] + '</a></li>', level)
            self.output('</ul>', level)
        if tree.children:
            self.output('<b>Children:</b>', level)
            self.output('<ul>', level)
            for c in tree.children:
                self.print_tree(c, level + 1)
            self.output('</ul>', level)


starttime = time.time()
crawler = Crawler(args.domain)
c = crawler.crawl_domain()
endtime = time.time()

p = Parser(c, args.file)
p.render_html(endtime - starttime)
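# A minimal sketch of the argument parsing the module-level code above relies
# on; the flag names and help texts are assumptions, not from the source.
import argparse

arg_parser = argparse.ArgumentParser(description='Crawl a domain and render the site tree as HTML.')
arg_parser.add_argument('domain', help='root URL to crawl')
arg_parser.add_argument('-f', '--file', default=None, help='output file; print to stdout if omitted')
args = arg_parser.parse_args()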
def main():
    crawler = Crawler("http://start.bg/")
    database = Database()
    crawler.start()
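# A conventional entry point for the snippet above (an assumption; the original
# may invoke main() elsewhere):
if __name__ == '__main__':
    main()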
single_row = RowParser(rows[0])
single_row.extract_fields()
print(single_row.extracted_content)

print("-------- Extracting data from all results --------")
results = []
for i in rows:
    single_row = RowParser(i)
    single_row.extract_fields()
    results.append(single_row.extracted_content)
print(results)
print(f"length of results: {len(results)}")

print("testing the new class")
parser_ = Parser(a.page_content)
parser_.extract_fields()
print(parser_.results)
print(parser_._log)

print("testing the master object")
crawler = Crawler(search_params)
crawler.get_all()
print(crawler.results)
print(crawler.log)
def __init__(self):
    self.redis = RedisClient()
    self.crawler = Crawler()
async def invalid_url(self):
    url = 'http://yoyowalletxxxx.com'
    crawler = Crawler('')
    result = await crawler.get_body(url)
    self.assertEqual(result, '')
async def valid_url(self):
    url = 'http://yoyowallet.com'
    crawler = Crawler('')
    result = await crawler.get_body(url)
    self.assertTrue(result)
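# A sketch of the test-case class the two coroutines above appear to belong to,
# given their use of self.assert*; the class name is hypothetical, and
# unittest.IsolatedAsyncioTestCase requires Python 3.8+.
import unittest

class CrawlerGetBodyTest(unittest.IsolatedAsyncioTestCase):
    async def test_valid_url(self):
        result = await Crawler('').get_body('http://yoyowallet.com')
        self.assertTrue(result)

if __name__ == '__main__':
    unittest.main()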
from crawl import Crawler
import json
from common import get_regex
from tools.mysql_operator import MySqlOperator

url = 'https://sf.taobao.com/item_list.htm?city=&province=%D5%E3%BD%AD'
crawler = Crawler()
res, session = crawler.crawl(url=url, encoding='gbk')
raw_data = get_regex(
    r'<script id="sf-item-list-data" type="text/json">([\S\s]*?)</script>',
    res.text, 1)
jdata = json.loads(raw_data)

data_list = list()
for item in jdata['data']:
    item_info = {'id': item['id'], 'title': item['title']}
    data_list.append(item_info)

db = MySqlOperator(server='127.0.0.1', user_name='root', password='', dbname='taobao_sf')
db.bulk_insert('test_tb', data_list)
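# A plausible sketch of the get_regex helper imported from common above:
# return one capture group of the first match, or None when nothing matches.
# This is an assumption; the source does not show its implementation.
import re

def get_regex(pattern, text, group_index):
    match = re.search(pattern, text)
    return match.group(group_index) if match else None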