def thread_func(filename, cur):
    c = Crawl()
    # read the uploaded file
    f = open('uploads/' + filename, 'r')
    i = 1
    while 1:
        print(cur, i)
        line = f.readline().strip('\n')
        if i <= cur:
            i = i + 1
            continue
        rs = Setting.query.filter_by(name='is_crawl').first()
        if rs.value == '0':
            break
        if not line:
            break
        time.sleep(1)
        flag = c.crawl(line)
        if flag:
            db.session.add(PhoneList(filename=filename, phonenumber=str(line),
                                     status="2", opt_time=int(time.time())))
            db.session.commit()
        else:
            db.session.add(PhoneList(filename=filename, phonenumber=str(line),
                                     status="1", opt_time=int(time.time())))
            db.session.commit()
        pass  # do something
    f.close()
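# Hypothetical usage sketch (not from the source): thread_func is written to run in a
# background thread and to resume from a saved cursor position; the filename and
# cursor value below are illustrative assumptions only.
import threading

worker = threading.Thread(target=thread_func, args=('numbers.txt', 0))
worker.daemon = True
worker.start()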
def __init__(self):
    self.crawl = Crawl()
    self.analysis = Analysis()
    self.pipe = Pipeline()
    self._use_log()
    try:
        # the first CLI argument is expected to be a dict literal
        self.args_dict = eval(sys.argv[1])
        if not isinstance(self.args_dict, dict):
            raise ValueError('args must be like key-value ')
    except Exception as e:
        self.args_dict = {}
        logging.warning('get args failed:{}'.format(e))
    self.proxies = self.args_dict.get('proxies')  # proxy settings
    self.hdfs = self.args_dict.get('hdfs', {})  # HDFS settings
    # if either of these two arguments is missing, raise immediately and do not run
    if not self.hdfs or not self.proxies:
        raise ValueError('args not have hdfs or proxies')
    self.sleep_time = self.args_dict.get('sleep_time', 0.2)  # sleep interval
    self.service_args = self.args_dict.get('service_args', {})  # PhantomJS proxy settings
    self.aliyun_log = self.args_dict.get('aliyun_log', {})
    self.alilog = AliyunLog(
        '{}_{}'.format(setting.OTA_NAME, setting.CATEGORY_NAME),
        endp=self.aliyun_log.get('endpoint', endpoint),
        accid=self.aliyun_log.get('accessKeyId', accessKeyId),
        acckey=self.aliyun_log.get('accessKey', accessKey),
        proj=self.aliyun_log.get('project', project),
        logst=self.aliyun_log.get('logstore', logstore))
    # Aliyun log settings; still needs checking whether a missing parameter raises an error
    try:
        self.HDFS = HDFileSystem(host=self.hdfs.get('ip', '192.168.100.178'),
                                 port=self.hdfs.get('port', 8020))
    except:
        pass  # ignore HDFS connection failures
def start_crawl(self):
    # Validation for the inputs. (if i got time)
    # Start the crawl
    Crawl(self.website_url_input.get(),
          self.crawl_depth_input.get(),
          self.user_defined_regex_input.get())
    print("Crawl finished")
def crawl(self):
    crawl = Crawl()
    proxies = []
    self.logger.info('crawl beginning -------')
    for parser in PARSER_LIST:
        for url in parser['urls']:
            self.logger.info('crawling {0}'.format(url))
            result = crawl.run(url, parser)
            proxies.extend(result)
    self.logger.info('crawl end -------\n'
                     'crawl {0} ips'.format(len(proxies)))
    return proxies
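# Assumed shape of PARSER_LIST (illustrative only, not from the source): each entry
# carries at least a 'urls' list; any other keys would be consumed by Crawl.run().
PARSER_LIST = [
    {'name': 'example-source', 'urls': ['http://example.com/free-proxy-list']},
]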
def __init__(self, goal, time):
    '''goal = today's subscription target (increase); time = refresh interval (minutes)'''
    self.goal = goal
    self.time_in_seconds = time * 60
    self.c = Crawl(goal)  # initialise the crawler
    # set up the GUI
    self.root = Tk()
    ########################### initial window position ##################
    self.root.geometry('220x45+40+560')  # width x height + x offset + y offset
    #####################################################################
    self.root.title('就是要莽')
    top_frame = Frame(self.root)  # upper frame shows the counters
    top_frame.pack(fill=BOTH)
    self.label_text1 = StringVar()
    self.label_text1.set('今日订阅:')
    text_label = Label(top_frame, textvariable=self.label_text1, font="32")
    text_label.grid(row=0, sticky='w')
    self.cur_num = StringVar()  # current subscriber count
    num_label = Label(top_frame, textvariable=self.cur_num, fg="red", font="28")
    num_label.grid(row=0, column=1, sticky='e')
    self.label_text2 = StringVar()
    self.label_text2.set('/' + str(self.goal))
    objective_label = Label(top_frame, textvariable=self.label_text2, font="28")
    objective_label.grid(row=0, column=2, sticky='w')
    top_frame.columnconfigure(0, weight=4)  # adjust widget placement
    top_frame.columnconfigure(1, weight=2)
    top_frame.columnconfigure(2, weight=2)
    bottom_frame = Frame(self.root)  # lower frame holds the manual refresh buttons
    bottom_frame.pack(fill=BOTH, side=BOTTOM)
    refresh_button = Button(bottom_frame, text='手动刷新', font="25")
    refresh_button.bind('<Button-1>', self.refresh)
    refresh_button.grid(row=0, column=0, sticky=("N", "S", "E", "W"))
    fans_button = Button(bottom_frame, text='当前订阅', font="25")
    fans_button.bind('<Button-1>', self.refresh_total_fans)
    fans_button.grid(row=0, column=1, sticky=("N", "S", "E", "W"))
    bottom_frame.columnconfigure(0, weight=1)
    bottom_frame.columnconfigure(1, weight=1)
    self.root.rowconfigure(0, weight=3)  # adjust widget placement
    self.root.rowconfigure(1, weight=1)
    t = threading.Thread(target=self.start_crawl)  # start crawling in the background
    t.daemon = True
    t.start()
    self.root.mainloop()
def wrap_crawl(url, threads, user_agent, proxy, timeout, obey_robots, max_urls, data_format):
    freeze_support()
    seo = Crawl(url, threads=threads, user_agent=user_agent, proxy=proxy,
                timeout=timeout, obey_robots=obey_robots, max_urls=max_urls,
                data_format=data_format)
    seo.run_crawler()
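# Hypothetical call sketch (all argument values are assumptions, not from the source):
# wrap_crawl only forwards its parameters to Crawl and starts the crawler.
wrap_crawl('https://example.com', threads=4, user_agent='Mozilla/5.0', proxy=None,
           timeout=10, obey_robots=True, max_urls=500, data_format='csv')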
def __init__(self, name):
    self.crawl = Crawl()
    self.analysis = Analysis()
    self.pipe = Pipeline()
    self.options = webdriver.ChromeOptions()
    # set the download directory
    prefs = {
        'profile.default_content_settings.popups': 0,
        'download.default_directory': os.path.abspath('DATA')
    }
    self.options.add_experimental_option('prefs', prefs)
    self.driver = webdriver.Chrome(chrome_options=self.options)
    # percent-encode the GBK bytes of the name for use in URLs
    self.name = str(name.encode('gbk'))[2:-1].replace('\\x', '%').upper()
def __init__(self):
    self.count = {
        'count': 0,                    # total crawled
        'failed_count': 0,             # total failed
        'sucess_count': 0,             # total succeeded
        'start_time': time.asctime(),  # start time
        'end_time': 0,                 # end time
    }
    self.endtime = time.localtime().tm_min + 1
    self.proxy = next(proxies)
    self.Crawl = Crawl()
    self.Crawl.proxy = self.proxy
    self.Taskqueue = Queue()
    self.Urlqueue = Queue()
def evaluation_chart(self):
    # table name for the sales ranking
    sales_volume_rankings_table_name = 'sales_volume_rankings'
    # table name for the hot-review ranking
    heat_rankings_table_name = 'heat_rankings'
    # create the custom database object
    mysql = MySQL()
    # create the crawler object
    mycrawl = Crawl()
    # connect to the database
    sql = mysql.connection_sql()
    # create a cursor
    cur = sql.cursor()
    good_rate_list = []  # list of positive-review rates
    # query the followed books' info, in particular their JD ids
    attention_message = mysql.query_attention(cur, 'jd_id,book_name',
                                              sales_volume_rankings_table_name,
                                              "attention = '1'")
    for i in range(len(attention_message)):
        # fetch the positive-review rate and the review time
        good_rate, time = mycrawl.get_evaluation(0, attention_message[i][0])
        # append the followed item's name and its positive-review rate
        good_rate_list.append((attention_message[i][1], good_rate))
        # first followed item
        if i == 0:
            plt1 = PlotCanvas()  # create the chart canvas object
            # draw the review-analysis pie chart
            plt1.pie_chart(good_rate_list[0][1], (100 - good_rate_list[0][1]), good_rate_list[0][0])
            # add the chart to the layout
            self.horizontalLayout_0.addWidget(plt1)
        # second followed item
        if i == 1:
            plt2 = PlotCanvas()
            plt2.pie_chart(good_rate_list[1][1], (100 - good_rate_list[1][1]), good_rate_list[1][0])
            self.horizontalLayout_1.addWidget(plt2)
        # third followed item
        if i == 2:
            plt3 = PlotCanvas()
            plt3.pie_chart(good_rate_list[2][1], (100 - good_rate_list[2][1]), good_rate_list[2][0])
            self.horizontalLayout_2.addWidget(plt3)
    mysql.close_sql()  # close the database connection
def crawl_name(self, item_id_inner, proxy_inner, mall_id_inner):
    if mall_id_inner == '1':  # jd
        crawl = Crawl()
        item_name_inner = crawl.get_name_jd(item_id_inner, proxy_inner)
        return item_name_inner
    elif mall_id_inner == '2':  # tm
        #crawl = Crawl()
        #item_name_inner = crawl.get_name_tm(item_id_inner, proxy_inner)
        #return item_name_inner
        # placeholder: "Tmall price scraping is still being worked on; name not shown yet"
        temp_item_name = '天猫价格抓取正在攻克中,名称暂不显示'
        return temp_item_name
    elif mall_id_inner == '3':  # tb
        #crawl = Crawl()
        #item_name_inner = crawl.get_name_tb(item_id_inner, proxy_inner)
        #return item_name_inner
        # placeholder: "Taobao price scraping is still being worked on; name not shown yet"
        temp_item_name = '淘宝价格抓取正在攻克中,名称暂不显示'
        return temp_item_name
    else:
        # "no mall configured for this item"
        return '该商品未设定商城名'
def crawl_price(self, item_id_inner, proxy_inner, mall_id_inner):
    if mall_id_inner == '1':
        crawl = Crawl()
        item_price_inner = crawl.get_price_jd(item_id_inner, proxy_inner)
        return item_price_inner
    elif mall_id_inner == '2':
        #crawl = Crawl()
        #item_price_inner = crawl.get_price_tm(item_id_inner, proxy_inner)
        #return item_price_inner
        temp_item_price = '-1'  # Tmall price scraping not supported yet
        return temp_item_price
    elif mall_id_inner == '3':
        #crawl = Crawl()
        #item_price_inner = crawl.get_price_tb(item_id_inner, proxy_inner)
        #return item_price_inner
        temp_item_price = '-1'  # Taobao price scraping not supported yet
        return temp_item_price
    else:
        return '-1'
def main():
    try:
        name = prompt()
        # create authenticated twitter api object
        auth = authenticate.Authenticate(creds_file='twitter_creds.BU')
        # crawl the given twitter profile for reciprocal friends
        crawl = Crawl(twitter_api=auth.twitter_api, screen_name=name, node_max=100)
        # crawl = Crawl(twitter_api=auth.twitter_api, screen_name='smerconish', node_max=100)
        crawl.crawl_followers()
        crawl.file_output.close()  # close file
        # create a graph object using networkx and visualize it using graphviz
        g = Graph(use_name=True, twitter_api=auth.twitter_api, screen_name=name)
    except Exception as e:
        print(traceback.format_exc())
def __init__(self, root="."):
    self.search_path = Crawl(root)
    self.version = ''
    self.cache = None
    self.engines = copy.deepcopy(engine_registry)
    self.mimetypes = copy.deepcopy(mimetype_registry)
    self.processors = copy.deepcopy(processor_registry)

    class ctx(Context):
        pass

    self.context_class = ctx

    for path in path_registry.paths:
        self.search_path.append_path(path)

    for extension in self.mimetypes.mimetypes.keys():
        self.search_path.append_extension(extension)

    for ext, engine in self.engines.engines.iteritems():
        self.add_engine_to_search_path(ext, engine)
def get_data(usr_id, token):
    sys.path.append('../')
    from crawl import Crawl
    import time
    c = Crawl()
    print 'Start web crawl.'
    c.update([usr_id], token_list=[token])
    c.update_img([usr_id], token_list=[token])
    c.update_voice([usr_id], token_list=[token])
    print 'Crawl is finished.'
    print 'Start analysis.'
    #os.system('java -Djava.ext.dirs=../../predict/lib -jar ../../predict/predictor.jar ../../analysis/data_json/'+usr_id)
    os.system('java -Djava.ext.dirs=./lib -jar predictor.jar ../../analysis/data_json/' + usr_id)
    print 'Analysis is finished.'
    global five_result
    #with open('../../predict/predict_result/'+usr_id+'.txt') as ifile:
    with open('predict_result/' + usr_id + '.txt') as ifile:
        five_result = eval(ifile.read())
    global finished
    finished = True
def __init__(self, email='', rate=60, note=60 * 60):
    config = 'config.cfg'
    cfg = configparser.ConfigParser()
    parentDirPath = os.path.dirname(os.path.abspath(__file__))
    path = parentDirPath + '/config/' + config
    cfg.read(path)
    self.option = cfg.get('select', 'option')
    self.scheduler = sched.scheduler(time.time, time.sleep)
    self.goods_dict = {}
    self.db = DB()
    self.crawl = Crawl()
    self.mail = Mail()
    self.ding = Dingding()
    self.email = [email]  # email address
    self.rate = rate      # refresh interval
    self.note = note      # notification interval
    # load saved data
    result = self.db.query()
    print('----------加载数据----------')
    for id, item in result.items():
        self.goods_dict[id] = Goods(item['id'], item['want'], item['status'], item['dname'])
        print(self.goods_dict[id].__dict__)
    print('----------加载完成----------')
def run():
    crawler = Crawl()
    vips = crawler.all_come_to_bowl()
    print_vips(vips)
def crawl_name_price(self, item_id):
    crawl = Crawl()
    item_price_inner = crawl.get_price(item_id)
    item_name_inner = crawl.get_name(item_id)
    return item_name_inner, item_price_inner
try:
    from local_settings import *
except ImportError:
    pass


def continue_load():
    # prompt: "already fetched %s records, continue? (Y/N)"
    reload = raw_input('已经抓取%s 条数据是否继续(Y/N):' % len(crawl.all_data))
    if str(reload) == 'Y':
        try:
            crawl.selenium_page()
        except Exception as e:
            continue_load()
    else:
        pass


if __name__ == '__main__':
    try:
        crawl = Crawl(DEFAULT_URL, DEFAULT_ID, CHROMEDRIVER_URL, XLWT_URL)
        crawl.open()
        crawl.run()
        # "crawl finished, fetched %s records"
        print('抓取完成, 已经抓取%s 条数据' % len(crawl.all_data))
    except Exception as e:
        continue_load()
    # prompt: "export to a spreadsheet? enter a name (ctrl + c to quit)"
    title = raw_input('是否输出为表格并输入名称 ( 关闭 ctrl + c ): ')
    if len(title):
        crawl.write_xlwt(str(title))
def __init__(self):
    self.c = Crawl()
    self.e = Excel()
def __init__(self, goal, time, room_id):
    '''goal = today's subscription target (increase); time = refresh interval (minutes); room_id = live-room ID'''
    self.goal = goal
    self.time_in_seconds = time * 60
    self.today_maximum = -1  # today's peak subscriber count
    self.c = Crawl(goal, str(room_id))  # initialise the crawler
    # set up the GUI
    self.root = Tk()
    self.root.configure(background='#f3b513')
    if os.path.isfile('doorbell.wav'):  # check that the sound file exists before loading it
        self.has_music = True
    else:
        self.has_music = False
    ########################### initial window position ##################
    self.root.geometry('200x37+21+733')  # width x height + x offset + y offset
    #####################################################################
    self.root.title('太阳')
    left_frame = Frame(self.root, background='#f3b513')  # left frame shows the counters
    left_frame.grid(row=0, column=0)
    self.label_text1 = StringVar()
    self.label_text1.set('今日订阅:')
    text_label = Label(left_frame, textvariable=self.label_text1, font="32", background='#f3b513')
    text_label.grid(row=0, sticky='w')
    self.cur_num = StringVar()  # current subscriber count
    num_label = Label(left_frame, textvariable=self.cur_num, fg="red", font="28", background='#f3b513')
    num_label.grid(row=0, column=1, sticky='e')
    self.label_text2 = StringVar()
    self.label_text2.set('/' + str(self.goal))
    objective_label = Label(left_frame, textvariable=self.label_text2, font="28", background='#f3b513')
    objective_label.grid(row=0, column=2, sticky='w')
    right_frame = Frame(self.root, background='#f3b513')  # right frame holds the refresh and total buttons
    right_frame.grid(row=0, column=1)
    # bottom_frame.pack(fill=BOTH, side=BOTTOM)
    refresh_button = Button(right_frame, text='刷新', font="25", background='#f3b513')
    refresh_button.bind('<Button-1>', self.refresh)
    refresh_button.grid(row=0, column=0, sticky=("N", "S", "E", "W"), padx=4, pady=4)
    fans_button = Button(right_frame, text='总订', font="25", background='#f3b513')
    fans_button.bind('<Button-1>', self.refresh_total_fans)
    fans_button.grid(row=0, column=1, sticky=("N", "S", "E", "W"), padx=4, pady=4)
    right_frame.columnconfigure(0, weight=1)
    right_frame.columnconfigure(1, weight=1)
    self.root.columnconfigure(0, minsize=50)
    self.root.columnconfigure(1, weight=1)  # adjust widget placement
    t = threading.Thread(target=self.start_crawl)  # start crawling in the background
    t.daemon = True
    t.start()
    self.root.mainloop()
#encoding=utf-8
from crawl import Crawl

useridFile = open("userid.txt", 'r')
userid = useridFile.read().strip()
useridFile.close()
open("result.txt", 'w').close()

c = Crawl()
print "Job Started ...\n"

page = 1
url = c.host + '/' + userid + '/myfans?t=4&page=' + str(page)
while (c.run(url)):
    print "fans in page " + str(page) + "\n"
    page += 1
    url = c.host + '/' + userid + '/myfans?t=4&page=' + str(page)

print "Done!\n"
def crawl_torrent(dir):
    print "crawl torrent"
    crawl = Crawl(dir.dir_name)
    crawl.start(dir.fid, 1, 5, 5)
    do_zip(dir)
def get_result():
    logger.info(request.args['keywords'])
    crawl = Crawl()
    result = crawl.crawl(request.args['keywords'])
    logger.info(result)
    return render_template('result.html', result=result)
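# Hypothetical wiring sketch (assumed, not from the source): as a Flask view, get_result
# would be registered against a route so that GET /result?keywords=... triggers the crawl:
#   app.add_url_rule('/result', view_func=get_result)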
import atexit
from crawl import Crawl

# max uid 703222999
MAX_UID = 703222999

if __name__ == '__main__':
    crawler = Crawl()
    atexit.register(crawler.save_data)
    start_id = input("start uid[1]: ")
    if not start_id:
        start_id = 1
    else:
        start_id = int(start_id)
    end_id = input("end uid[MAX]:")
    if not end_id:
        end_id = MAX_UID
    else:
        end_id = int(end_id)
    show_status = input("show requests?(yes/[no]):")
    if not show_status or show_status == 'no':
        show_status = False
    else:
        show_status = True
    crawler.start_crawling(start_id, min(end_id, MAX_UID), show_status)
def __init__(self):
    self.crawl = Crawl()
    self.analysis = Analysis()
    self.pipe = Pipeline()
    f.write('{} {}'.format(page_number, article_number))
    f.flush()


def get_break_point(filename):
    """Read the saved break-point value."""
    f = open(filename, 'r')
    v = f.read()
    f.close()
    return [int(x) for x in v.split(' ')]


if __name__ == '__main__':
    key_word = "智慧图书馆"
    craw = Crawl()
    craw.init_cookies()
    page_number = 1      # page to start crawling from
    article_number = 0   # number of articles processed
    break_point_filename = "./break_point.dat"
    if os.path.isfile(break_point_filename):
        page_number, article_number = get_break_point(break_point_filename)
    # file that stores the article list
    articles_filename = "articles.csv"
    if not os.path.isfile(articles_filename):
        # file does not exist; create a new template
        print("初始化模板文件.")
        fp = open(articles_filename, 'w')
        fp.write('title;summary;article_uri;account_name\n')
        fp.close()
    with open(articles_filename, 'a') as fp:
# About-window initialisation class
class About_Window(QMainWindow, About_MainWindow):
    def __init__(self):
        super(About_Window, self).__init__()
        self.setupUi(self)

    # show the window
    def open(self):
        self.show()


if __name__ == "__main__":
    # create the custom database object
    mysql = MySQL()
    # create the crawler object
    mycrawl = Crawl()
    # connect to the database
    sql = mysql.connection_sql()
    # create a cursor
    cur = sql.cursor()
    app = QApplication(sys.argv)
    # main window object
    main = Main()
    # show the main window
    main.show()
    # sales ranking window object
    sales = Sales()
    # hot-review ranking window object
    heat = Heat()
def crawl(args):
    if args.crawl:
        crawler = Crawl(args)
        crawler.crawl()
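# Hypothetical argparse wiring (assumed, not from the source) showing how the crawl()
# dispatcher above could be driven from a command line.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--crawl', action='store_true', help='run the crawler')
crawl(parser.parse_args())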
from crawl import Crawl
from crawley import Crawley

crawley = Crawley()
crawley.welcome()
url, levels, user_defined_regex = crawley.user_input()
crawl = Crawl(url, levels, user_defined_regex)
crawl.perform_crawl()
#crawl.test_variables()
crawl.save_report()
crawley.report()
while True:
    if crawley.crawl_another() == True:
        url, levels, user_defined_regex = crawley.user_input()
        crawl = Crawl(url, levels, user_defined_regex)
        crawl.perform_crawl()
        #crawl.test_variables()
        crawl.save_report()
        crawley.report()
    else:
        crawley.goodbye()
        break
def main():
    for id in ID_LIST:
        c = Crawl("https://vod.gate.panda.tv/api/hostvideos", id)
        c.start()
        time.sleep(30 * 60)