Example #1
def thread_func(filename, cur):
    c = Crawl()

    # read the uploaded number list file
    f = open('uploads/' + filename, 'r')
    i = 1
    while True:
        print(cur, i)

        line = f.readline().strip('\n')
        if i <= cur:  # skip lines that were already processed in a previous run
            i = i + 1
            continue
        rs = Setting.query.filter_by(name='is_crawl').first()

        if rs.value == '0':  # crawling has been switched off
            break
        if not line:
            break
        time.sleep(1)
        flag = c.crawl(line)

        # status "2" = crawled successfully, "1" = failed
        status = "2" if flag else "1"
        db.session.add(PhoneList(filename=filename, phonenumber=str(line),
                                 status=status, opt_time=int(time.time())))
        db.session.commit()
    f.close()
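The snippet relies on Flask-SQLAlchemy models named Setting and PhoneList defined elsewhere in the project; a minimal sketch of what they might look like, with column names inferred from the calls above and every type and length an assumption:

# Hypothetical model definitions (not from the original project); only the field
# names are taken from the usage in thread_func above.
class Setting(db.Model):
    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.String(64), unique=True)
    value = db.Column(db.String(64))


class PhoneList(db.Model):
    id = db.Column(db.Integer, primary_key=True)
    filename = db.Column(db.String(255))
    phonenumber = db.Column(db.String(32))
    status = db.Column(db.String(8))
    opt_time = db.Column(db.Integer)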
Example #2
 def __init__(self):
     self.crawl = Crawl()
     self.analysis = Analysis()
     self.pipe = Pipeline()
     self._use_log()
     try:
         # the first CLI argument is expected to be a Python dict literal
         self.args_dict = eval(sys.argv[1])
         if not isinstance(self.args_dict, dict):
             raise ValueError('args must be like key-value ')
     except Exception as e:
         self.args_dict = {}
         logging.warning('get args failed:{}'.format(e))
     self.proxies = self.args_dict.get('proxies')  # proxy configuration
     self.hdfs = self.args_dict.get('hdfs', {})  # HDFS configuration
     # if either of these two arguments is missing, raise an exception and stop
     if not self.hdfs or not self.proxies:
         raise ValueError('args not have hdfs or proxies')
     self.sleep_time = self.args_dict.get('sleep_time', 0.2)  # sleep interval
     self.service_args = self.args_dict.get('service_args',
                                            {})  # PhantomJS proxy configuration
     self.aliyun_log = self.args_dict.get('aliyun_log', {})
     self.alilog = AliyunLog(
         '{}_{}'.format(setting.OTA_NAME, setting.CATEGORY_NAME),
         endp=self.aliyun_log.get('endpoint', endpoint),
         accid=self.aliyun_log.get('accessKeyId', accessKeyId),
         acckey=self.aliyun_log.get('accessKey', accessKey),
         proj=self.aliyun_log.get('project', project),
         logst=self.aliyun_log.get('logstore',
                                   logstore))  # Aliyun log configuration; still needs checking whether it raises when this argument is missing
     try:
         self.HDFS = HDFileSystem(host=self.hdfs.get(
             'ip', '192.168.100.178'),
                                  port=self.hdfs.get('port', 8020))
     except Exception:
         # connecting to HDFS is treated as optional here; failures are silently ignored
         pass
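Since the constructor evaluates its first command-line argument as a Python dict literal, the spider would presumably be launched roughly like this (the script name and all values are illustrative assumptions, not taken from the original project):

# python ota_spider.py "{'proxies': {'http': 'http://127.0.0.1:8888'}, 'hdfs': {'ip': '192.168.100.178', 'port': 8020}, 'sleep_time': 0.5}"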
Example #3
    def start_crawl(self):
        # TODO: validate the inputs (if I get time)

        # Start the crawl
        Crawl(self.website_url_input.get(), self.crawl_depth_input.get(),
              self.user_defined_regex_input.get())
        print("Crawl finished")
Example #4
 def crawl(self):
     crawl = Crawl()
     proxies = []
     self.logger.info('crawl beginning -------')
     for parser in PARSER_LIST:
         for url in parser['urls']:
             self.logger.info('crawling {0}'.format(url))
             result = crawl.run(url, parser)
             proxies.extend(result)
     self.logger.info('crawl end -------\n'
                      'crawl {0} ips'.format(len(proxies)))
     return proxies
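PARSER_LIST itself is not shown; judging only from parser['urls'] and the call crawl.run(url, parser), each entry is presumably a dict describing one proxy source. A purely hypothetical sketch of its shape:

# Hypothetical structure only; the real fields depend on what Crawl.run() expects.
PARSER_LIST = [
    {
        'urls': ['http://www.example-proxy-list.com/free/1',
                 'http://www.example-proxy-list.com/free/2'],
        'type': 'xpath',            # assumed: how the page should be parsed
        'pattern': "//table//tr",   # assumed: selector for the proxy rows
    },
]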
Example #5
    def __init__(self, goal, time):
        '''goal = today's subscription target (increase)
            time = refresh interval (minutes)'''
        self.goal = goal
        self.time_in_seconds = time * 60
        self.c = Crawl(goal)  # initialize the Crawler
        # set up the GUI
        self.root = Tk()

        ###########################     initial window position ##################
        self.root.geometry('220x45+40+560')  # width x height + right offset + down offset
        #####################################################################

        self.root.title('就是要莽')
        top_frame = Frame(self.root)  # top frame displays the information
        top_frame.pack(fill=BOTH)
        self.label_text1 = StringVar()
        self.label_text1.set('今日订阅:')
        text_label = Label(top_frame, textvariable=self.label_text1, font="32")
        text_label.grid(row=0, sticky='w')
        self.cur_num = StringVar()  # current subscription count
        num_label = Label(top_frame,
                          textvariable=self.cur_num,
                          fg="red",
                          font="28")
        num_label.grid(row=0, column=1, sticky='e')
        self.label_text2 = StringVar()
        self.label_text2.set('/' + str(self.goal))
        objective_label = Label(top_frame,
                                textvariable=self.label_text2,
                                font="28")
        objective_label.grid(row=0, column=2, sticky='w')
        top_frame.columnconfigure(0, weight=4)  # adjust widget positions
        top_frame.columnconfigure(1, weight=2)
        top_frame.columnconfigure(2, weight=2)

        bottom_frame = Frame(self.root)  # bottom frame for manually fetching the latest subscription count
        bottom_frame.pack(fill=BOTH, side=BOTTOM)
        refresh_button = Button(bottom_frame, text='手动刷新', font="25")
        refresh_button.bind('<Button-1>', self.refresh)
        refresh_button.grid(row=0, column=0, sticky=("N", "S", "E", "W"))
        fans_button = Button(bottom_frame, text='当前订阅', font="25")
        fans_button.bind('<Button-1>', self.refresh_total_fans)
        fans_button.grid(row=0, column=1, sticky=("N", "S", "E", "W"))
        bottom_frame.columnconfigure(0, weight=1)
        bottom_frame.columnconfigure(1, weight=1)
        self.root.rowconfigure(0, weight=3)  # adjust widget positions
        self.root.rowconfigure(1, weight=1)

        t = threading.Thread(target=self.start_crawl)  # start crawling in the background
        t.daemon = True
        t.start()
        self.root.mainloop()
Example #6
def wrap_crawl(url, threads, user_agent, proxy, timeout, obey_robots, max_urls,
               data_format):
    freeze_support()
    seo = Crawl(url,
                threads=threads,
                user_agent=user_agent,
                proxy=proxy,
                timeout=timeout,
                obey_robots=obey_robots,
                max_urls=max_urls,
                data_format=data_format)
    seo.run_crawler()
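freeze_support() is only needed when a multiprocessing-based crawler runs from a frozen Windows executable; a plain call to the wrapper might look like this (every argument value below is an illustrative assumption, not a project default):

if __name__ == '__main__':
    wrap_crawl('https://example.com',
               threads=4,
               user_agent='Mozilla/5.0 (compatible; seo-crawler)',
               proxy=None,
               timeout=10,
               obey_robots=True,
               max_urls=500,
               data_format='csv')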
Example #7
 def __init__(self, name):
     self.crawl = Crawl()
     self.analysis = Analysis()
     self.pipe = Pipeline()
     self.options = webdriver.ChromeOptions()
     # specify the download directory
     prefs = {
         'profile.default_content_settings.popups': 0,
         'download.default_directory': os.path.abspath('DATA')
     }
     self.options.add_experimental_option('prefs', prefs)
     self.driver = webdriver.Chrome(chrome_options=self.options)
     self.name = str(name.encode('gbk'))[2:-1].replace('\\x', '%').upper()  # GBK percent-encode the name for use in URLs
Example #8
    def __init__(self):
        self.count = {
            'count': 0,  # total crawl attempts
            'failed_count': 0,  # total failed crawls
            'sucess_count': 0,  # total successful crawls
            'start_time': time.asctime(),  # start time
            'end_time': 0,  # end time
        }
        self.endtime = time.localtime().tm_min + 1
        self.proxy = next(proxies)
        self.Crawl = Crawl()
        self.Crawl.proxy = self.proxy

        self.Taskqueue = Queue()
        self.Urlqueue = Queue()
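Example #9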
 def evaluation_chart(self):
     # name of the sales-ranking table
     sales_volume_rankings_table_name = 'sales_volume_rankings'
     # name of the hot-review-ranking table
     heat_rankings_table_name = 'heat_rankings'
     # create the custom database object
     mysql = MySQL()
     # create the crawler object
     mycrawl = Crawl()
     # connect to the database
     sql = mysql.connection_sql()
     # create a cursor
     cur = sql.cursor()
     good_rate_list = []  # list of positive-review rates
     # query the JD ids of the followed books
     attention_message = mysql.query_attention(cur, 'jd_id,book_name',
                                               sales_volume_rankings_table_name, "attention = '1'")
     for i in range(len(attention_message)):
         # fetch the positive-review rate and the review time (the time value is unused here)
         good_rate, _review_time = mycrawl.get_evaluation(0, attention_message[i][0])
         # append the followed product's name and positive-review rate to the list
         good_rate_list.append((attention_message[i][1], good_rate))
         # first followed product
         if i == 0:
             plt1 = PlotCanvas()  # create the plot canvas object
             # show the review analysis chart
             plt1.pie_chart(good_rate_list[0][1],
                            (100 - good_rate_list[0][1]), good_rate_list[0][0])
             # add the review analysis chart to the layout
             self.horizontalLayout_0.addWidget(plt1)
         # second followed product
         if i == 1:
             plt2 = PlotCanvas()
             plt2.pie_chart(good_rate_list[1][1],
                            (100 - good_rate_list[1][1]), good_rate_list[1][0])
             self.horizontalLayout_1.addWidget(plt2)
         # third followed product
         if i == 2:
             plt3 = PlotCanvas()
             plt3.pie_chart(good_rate_list[2][1],
                            (100 - good_rate_list[2][1]), good_rate_list[2][0])
             self.horizontalLayout_2.addWidget(plt3)
     mysql.close_sql()  # close the database connection
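Example #10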
 def crawl_name(self, item_id_inner, proxy_inner, mall_id_inner):
     if mall_id_inner == '1':  # jd
         crawl = Crawl()
         item_name_inner = crawl.get_name_jd(item_id_inner, proxy_inner)
         return item_name_inner
     elif mall_id_inner == '2':  # tm
         #crawl = Crawl()
         #item_name_inner = crawl.get_name_tm(item_id_inner, proxy_inner)
         #return item_name_inner
         # placeholder text: "Tmall price crawling is still being worked on; name not shown for now"
         temp_item_name = '天猫价格抓取正在攻克中,名称暂不显示'
         return temp_item_name
     elif mall_id_inner == '3':  # tb
         #crawl = Crawl()
         #item_name_inner = crawl.get_name_tb(item_id_inner, proxy_inner)
         #return item_name_inner
         # placeholder text: "Taobao price crawling is still being worked on; name not shown for now"
         temp_item_name = '淘宝价格抓取正在攻克中,名称暂不显示'
         return temp_item_name
     else:
         # "no mall configured for this item"
         return '该商品未设定商城名'
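Example #11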
 def crawl_price(self, item_id_inner, proxy_inner, mall_id_inner):
     if mall_id_inner == '1':
         crawl = Crawl()
         item_price_inner = crawl.get_price_jd(item_id_inner, proxy_inner)
         return item_price_inner
     elif mall_id_inner == '2':
         #crawl = Crawl()
         #item_price_inner = crawl.get_price_tm(item_id_inner, proxy_inner)
         #return item_price_inner
         temp_item_price = '-1'
         return temp_item_price
     elif mall_id_inner == '3':
         #crawl = Crawl()
         #item_price_inner = crawl.get_price_tb(item_id_inner, proxy_inner)
         #return item_price_inner
         temp_item_price = '-1'
         return temp_item_price
     else:
         return '-1'
Example #12
def main():
    try:
        name = prompt()
        # create authenticated twitter api object
        auth = authenticate.Authenticate(creds_file='twitter_creds.BU')
        # crawl the given twitter profile for reciprocal friends
        crawl = Crawl(twitter_api=auth.twitter_api,
                      screen_name=name,
                      node_max=100)
        # crawl = Crawl(twitter_api=auth.twitter_api, screen_name='smerconish', node_max=100)
        crawl.crawl_followers()
        crawl.file_output.close()  # close file

        #create a graph object using networkx and visualize it using graphviz
        g = Graph(use_name=True,
                  twitter_api=auth.twitter_api,
                  screen_name=name)

    except Exception as e:
        print(traceback.format_exc())
Example #13
    def __init__(self, root="."):
        self.search_path = Crawl(root)
        self.version = ''
        self.cache = None

        self.engines = copy.deepcopy(engine_registry)
        self.mimetypes = copy.deepcopy(mimetype_registry)
        self.processors = copy.deepcopy(processor_registry)

        class ctx(Context):
            pass

        self.context_class = ctx

        for path in path_registry.paths:
            self.search_path.append_path(path)

        for extension in self.mimetypes.mimetypes.keys():
            self.search_path.append_extension(extension)

        for ext, engine in self.engines.engines.iteritems():
            self.add_engine_to_search_path(ext, engine)
Example #14
def get_data(usr_id, token):
    sys.path.append('../')
    from crawl import Crawl
    import time
    c = Crawl()
    print 'Start web crawl.'
    c.update([usr_id], token_list=[token])
    c.update_img([usr_id], token_list=[token])
    c.update_voice([usr_id], token_list=[token])
    print 'Crawl is finished.'

    print 'Start analysis.'
    #os.system('java -Djava.ext.dirs=../../predict/lib -jar ../../predict/predictor.jar ../../analysis/data_json/'+usr_id)
    os.system(
        'java -Djava.ext.dirs=./lib -jar predictor.jar ../../analysis/data_json/'
        + usr_id)
    print 'Analysis is finished.'

    global five_result
    #with open('../../predict/predict_result/'+usr_id+'.txt') as ifile:
    with open('predict_result/' + usr_id + '.txt') as ifile:
        five_result = eval(ifile.read())
    global finished
    finished = True
Example #15
    def __init__(self, email='', rate=60, note=60 * 60):
        config = 'config.cfg'
        cfg = configparser.ConfigParser()
        parentDirPath = os.path.dirname(os.path.abspath(__file__))
        path = parentDirPath + '/config/' + config
        cfg.read(path)
        self.option = cfg.get('select', 'option')
        self.scheduler = sched.scheduler(time.time, time.sleep)
        self.goods_dict = {}
        self.db = DB()
        self.crawl = Crawl()
        self.mail = Mail()
        self.ding = Dingding()
        self.email = [email]  # e-mail address(es) to notify
        self.rate = rate  # refresh rate
        self.note = note  # notification interval

        # load data
        result = self.db.query()
        print('----------加载数据----------')
        for id, item in result.items():
            self.goods_dict[id] = Goods(item['id'], item['want'], item['status'], item['dname'])
            print(self.goods_dict[id].__dict__)
        print('----------加载完成----------')
Example #16
def run():
    crawler = Crawl()
    vips = crawler.all_come_to_bowl()
    print_vips(vips)
Example #17
 def crawl_name_price(self, item_id):
     crawl = Crawl()
     item_price_inner = crawl.get_price(item_id)
     item_name_inner = crawl.get_name(item_id)
     return item_name_inner, item_price_inner
Example #18
try:
    from local_settings import *
except ImportError:
    pass


def continue_load():
    reload = raw_input('已经抓取%s 条数据是否继续(Y/N):' % len(crawl.all_data))
    if str(reload) == 'Y':
        try:
            crawl.selenium_page()
        except Exception as e:
            continue_load()
    else:
        pass


if __name__ == '__main__':
    try:
        crawl = Crawl(DEFAULT_URL, DEFAULT_ID, CHROMEDRIVER_URL, XLWT_URL)
        crawl.open()
        crawl.run()
        print('抓取完成, 已经抓取%s 条数据' % len(crawl.all_data))
    except Exception as e:
        continue_load()

    title = raw_input('是否输出为表格并输入名称 ( 关闭 ctrl + c ): ')
    if len(title):
        crawl.write_xlwt(str(title))
Example #19
 def __init__(self):
     self.c = Crawl()
     self.e = Excel()
Example #20
    def __init__(self, goal, time, room_id):
        '''goal = today's subscription target (increase)
            time = refresh interval (minutes)
            room_id = live-room ID'''
        self.goal = goal
        self.time_in_seconds = time * 60
        self.today_maximum = -1  # today's highest subscription count
        self.c = Crawl(goal, str(room_id))  # initialize the Crawler
        # set up the GUI
        self.root = Tk()
        self.root.configure(background='#f3b513')
        if os.path.isfile('doorbell.wav'):  # check that the music file exists before loading it
            self.has_music = True
        else:
            self.has_music = False
        ###########################     initial window position ##################
        self.root.geometry('200x37+21+733')  # width x height + right offset + down offset
        #####################################################################

        self.root.title('太阳')
        left_frame = Frame(self.root, background='#f3b513')  # left frame displays the information
        left_frame.grid(row=0, column=0)
        self.label_text1 = StringVar()
        self.label_text1.set('今日订阅:')
        text_label = Label(left_frame,
                           textvariable=self.label_text1,
                           font="32",
                           background='#f3b513')
        text_label.grid(row=0, sticky='w')
        self.cur_num = StringVar()  # current subscription count
        num_label = Label(left_frame,
                          textvariable=self.cur_num,
                          fg="red",
                          font="28",
                          background='#f3b513')
        num_label.grid(row=0, column=1, sticky='e')
        self.label_text2 = StringVar()
        self.label_text2.set('/' + str(self.goal))
        objective_label = Label(left_frame,
                                textvariable=self.label_text2,
                                font="28",
                                background='#f3b513')
        objective_label.grid(row=0, column=2, sticky='w')

        right_frame = Frame(self.root,
                            background='#f3b513')  # right frame for manually refreshing the latest and total subscription counts
        right_frame.grid(row=0, column=1)
        # bottom_frame.pack(fill=BOTH, side=BOTTOM)
        refresh_button = Button(right_frame,
                                text='刷新',
                                font="25",
                                background='#f3b513')
        refresh_button.bind('<Button-1>', self.refresh)
        refresh_button.grid(row=0,
                            column=0,
                            sticky=("N", "S", "E", "W"),
                            padx=4,
                            pady=4)
        fans_button = Button(right_frame,
                             text='总订',
                             font="25",
                             background='#f3b513')
        fans_button.bind('<Button-1>', self.refresh_total_fans)
        fans_button.grid(row=0,
                         column=1,
                         sticky=("N", "S", "E", "W"),
                         padx=4,
                         pady=4)
        right_frame.columnconfigure(0, weight=1)
        right_frame.columnconfigure(1, weight=1)
        self.root.columnconfigure(0, minsize=50)
        self.root.columnconfigure(1, weight=1)  # adjust widget positions
        t = threading.Thread(target=self.start_crawl)  # start crawling in the background
        t.daemon = True
        t.start()
        self.root.mainloop()
Example #21
#encoding=utf-8
from crawl import Crawl

useridFile = open("userid.txt", 'r')
userid = useridFile.read().strip()
useridFile.close()

open("result.txt", 'w').close()

c = Crawl()

print "Job Started ...\n"
page = 1
url = c.host + '/' + userid + '/myfans?t=4&page=' + str(page)
while (c.run(url)):
    print "fans in page " + str(page) + "\n"
    page += 1
    url = c.host + '/' + userid + '/myfans?t=4&page=' + str(page)

print "Done!\n"
Example #22
def crawl_torrent(dir):
    print "crawl torrent"
    crawl = Crawl(dir.dir_name)
    crawl.start(dir.fid, 1, 5, 5)

    do_zip(dir)
Example #23
def get_result():
    logger.info(request.args['keywords'])
    crawl = Crawl()
    result = crawl.crawl(request.args['keywords'])
    logger.info(result)
    return render_template('result.html', result=result)
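get_result() reads request.args and returns render_template(), so it is presumably registered as a Flask view; a minimal sketch of that wiring (the route path and app object are assumptions, and logging is omitted):

# Hypothetical wiring; only the use of request.args and render_template is taken from the snippet.
from flask import Flask, request, render_template

app = Flask(__name__)

@app.route('/result')
def get_result():
    result = Crawl().crawl(request.args['keywords'])
    return render_template('result.html', result=result)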
Example #24
import atexit
from crawl import Crawl

# max uid 703222999
MAX_UID = 703222999

if __name__ == '__main__':
    crawler = Crawl()
    atexit.register(crawler.save_data)

    start_id = input("start uid[1]: ")
    if not start_id:
        start_id = 1
    else:
        start_id = int(start_id)

    end_id = input("end uid[MAX]:")
    if not end_id:
        end_id = MAX_UID
    else:
        end_id = int(end_id)

    show_status = input("show requests?(yes/[no]):")
    if not show_status or show_status == 'no':
        show_status = False
    else:
        show_status = True

    crawler.start_crawling(start_id, min(end_id, MAX_UID), show_status)
Example #25
 def __init__(self):
     self.crawl = Crawl()
     self.analysis = Analysis()
     self.pipe = Pipeline()
Example #26
        f.write('{} {}'.format(page_number, article_number))
        f.flush()


def get_break_point(filename):
    """Read the saved break-point values.
    """
    f = open(filename, 'r')
    v = f.read()
    f.close()
    return [int(x) for x in v.split(' ')]


if __name__ == '__main__':
    key_word = "智慧图书馆"  # search keyword ("smart library")
    craw = Crawl()
    craw.init_cookies()
    page_number = 1  # page to start crawling from
    article_number = 0  # number of articles processed so far
    break_point_filename = "./break_point.dat"
    if os.path.isfile(break_point_filename):
        page_number, article_number = get_break_point(break_point_filename)
    # file name for the saved article list
    articles_filename = "articles.csv"
    if not os.path.isfile(articles_filename):
        # the file does not exist, create a new template
        print("初始化模板文件.")
        fp = open(articles_filename, 'w')
        fp.write('title;summary;article_uri;account_name\n')
        fp.close()
    with open(articles_filename, 'a') as fp:
Example #27
# "About" window initialization class
class About_Window(QMainWindow, About_MainWindow):
    def __init__(self):
        super(About_Window, self).__init__()
        self.setupUi(self)

    # open the window
    def open(self):
        self.show()


if __name__ == "__main__":
    # create the custom database object
    mysql = MySQL()
    # create the crawler object
    mycrawl = Crawl()
    # connect to the database
    sql = mysql.connection_sql()
    # create a cursor
    cur = sql.cursor()

    app = QApplication(sys.argv)
    # main window object
    main = Main()
    # show the main window
    main.show()
    # sales-ranking window object
    sales = Sales()
    # hot-review-ranking window object
    heat = Heat()
Example #28
def crawl(args):
    if args.crawl:
        crawler = Crawl(args)
        crawler.crawl()
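crawl(args) only needs an object exposing a crawl attribute (plus whatever else Crawl(args) reads), which suggests an argparse Namespace; a minimal sketch of that command-line wiring, with only the --crawl flag inferred from the snippet:

# Hypothetical CLI setup; Crawl(args) presumably reads further options not shown here.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--crawl', action='store_true', help='run the crawler')
args = parser.parse_args()
crawl(args)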
Example #29
from crawl import Crawl
from crawley import Crawley

crawley = Crawley()
crawley.welcome() 
url, levels, user_defined_regex = crawley.user_input()
crawl = Crawl(url, levels, user_defined_regex)

crawl.perform_crawl()
#crawl.test_variables()
crawl.save_report()
crawley.report()

while True:
    if crawley.crawl_another():
        url, levels, user_defined_regex = crawley.user_input()
        crawl = Crawl(url, levels, user_defined_regex)

        crawl.perform_crawl()
        #crawl.test_variables()
        crawl.save_report()
        crawley.report()
    else:
        crawley.goodbye()
        break
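Example #30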
def main():
    for id in ID_LIST:
        c = Crawl("https://vod.gate.panda.tv/api/hostvideos", id)
        c.start()
        time.sleep(30 * 60)  # wait 30 minutes before crawling the next host