def work_thread(temp):
    """Poll ``WatchMan.is_change()`` and run one scrape when the value changes.

    Loops while ``get_loop()`` is truthy. When the polled value, rendered
    with ``str()``, differs from the stored ``patent_num`` of the settings
    row with id 1, the new value is saved, ``scrap()`` is run once, and
    the loop flag is cleared.

    Args:
        temp: Unused; present only because the thread starter passes one
            positional argument.
    """
    config = setting.objects.get(id=1)
    last_num = config.patent_num
    delay = config.scrap_delay
    logger.log('last_num: ' + last_num)
    while get_loop():
        num = WatchMan.is_change()
        if num is None:
            # No value this round: wait before polling again rather than
            # spinning in a tight loop.
            time.sleep(delay)
            continue
        logger.log('get the num: ' + str(num[0]))
        if str(num) != last_num:
            config.patent_num = num
            config.save()
            # Re-read the row; presumably so last_num matches the value
            # exactly as it was persisted.
            config = setting.objects.get(id=1)
            last_num = config.patent_num
            logger.log('last_num: ' + last_num)
            logger.log("catch it !")
            set_scraping(True)
            try:
                scrap()
            finally:
                # Reset both flags even when scrap() raises; this thread
                # stops either way, so neither flag may stay set.
                set_scraping(False)
                set_loop(False)
        logger.log("wathman is running !" + time.strftime("%y-%m-%d %H:%M"))

        # delay is expressed in seconds
        time.sleep(delay)
def work_thread(temp):
    """Poll ``WatchMan.is_change()`` and run one scrape when the value changes.

    Loops while ``get_loop()`` is truthy. When the polled value, rendered
    with ``str()``, differs from the stored ``patent_num`` of the settings
    row with id 1, the new value is saved, ``scrap()`` is run once, and
    the loop flag is cleared.

    Args:
        temp: Unused; present only because the thread starter passes one
            positional argument.
    """
    config = setting.objects.get(id=1)
    last_num = config.patent_num
    delay = config.scrap_delay
    logger.log('last_num: ' + last_num)
    while get_loop():
        num = WatchMan.is_change()
        if num is None:
            # No value this round: wait before polling again rather than
            # spinning in a tight loop.
            time.sleep(delay)
            continue
        logger.log('get the num: ' + str(num[0]))
        if str(num) != last_num:
            config.patent_num = num
            config.save()
            # Re-read the row; presumably so last_num matches the value
            # exactly as it was persisted.
            config = setting.objects.get(id=1)
            last_num = config.patent_num
            logger.log('last_num: ' + last_num)
            logger.log("catch it !")
            set_scraping(True)
            try:
                scrap()
            finally:
                # Reset both flags even when scrap() raises; this thread
                # stops either way, so neither flag may stay set.
                set_scraping(False)
                set_loop(False)
        logger.log("wathman is running !" + time.strftime("%y-%m-%d %H:%M"))

        # delay is expressed in seconds
        time.sleep(delay)
def watch(request):
    """Start the background watch thread unless one is already active.

    Responds with a refusal message when a scrape is in progress or the
    watch loop flag is already set. Otherwise sets the loop flag, starts
    ``work_thread`` in a new thread, waits 20 seconds and responds 'ok'.
    """
    scraping_now = get_scraping()
    if scraping_now == True:
        return HttpResponse('正在抓取,无法开启监控!')

    already_watching = get_loop()
    if already_watching == True:
        return HttpResponse('已经开启监控,请勿重复开启!')

    set_loop(True)
    worker_args = (1,)
    thread.start_new_thread(work_thread, worker_args)
    logger.log('continue')

    # Wait a while so the thread gets going before responding.
    time.sleep(20)
    return HttpResponse('ok')
def watch(request):
    """Start the background watch thread unless one is already active.

    Responds with a refusal message when a scrape is in progress or the
    watch loop flag is already set. Otherwise sets the loop flag, starts
    ``work_thread`` in a new thread, waits 20 seconds and responds 'ok'.
    """
    scraping_now = get_scraping()
    if scraping_now == True:
        return HttpResponse('正在抓取,无法开启监控!')

    already_watching = get_loop()
    if already_watching == True:
        return HttpResponse('已经开启监控,请勿重复开启!')

    set_loop(True)
    worker_args = (1,)
    thread.start_new_thread(work_thread, worker_args)
    logger.log('continue')

    # Wait a while so the thread gets going before responding.
    time.sleep(20)
    return HttpResponse('ok')
Exemple #5
0
 def re_transmission(self, url, opener, post_data=None, append=None):
     """Open ``url`` through ``opener``, retrying until a request succeeds.

     Args:
         url: Address to request.
         opener: Object whose ``open(url[, data])`` returns something with
             a ``read()`` method.
         post_data: Optional mapping sent as a url-encoded request body;
             each value is converted to UTF-8 bytes first. When ``None``
             the URL is opened without a body.
         append: Optional value appended to the body as an extra
             ``channelId`` parameter (only used together with
             ``post_data``).

     Returns:
         The response body returned by ``read()``.

     Note:
         There is no retry limit: every failure is logged and the request
         is issued again immediately.
     """
     body = None
     if post_data is not None:
         encoded = {}
         for k, v in post_data.iteritems():
             encoded[k] = unicode(v).encode('utf-8')
         body = urllib.urlencode(encoded)
         if append is not None:
             body += ('&channelId=' + append)
     while True:
         try:
             if body is None:
                 return opener.open(url).read()
             return opener.open(url, body).read()
         except Exception as e:
             logger.log("try to visit url:" + url + " failed !")
             logger.log(str(e), flush=True)
Exemple #6
0
 def re_transmission(self, url, opener, post_data=None, append=None):
     """Open ``url`` through ``opener``, retrying until a request succeeds.

     Args:
         url: Address to request.
         opener: Object whose ``open(url[, data])`` returns something with
             a ``read()`` method.
         post_data: Optional mapping sent as a url-encoded request body;
             each value is converted to UTF-8 bytes first. When ``None``
             the URL is opened without a body.
         append: Optional value appended to the body as an extra
             ``channelId`` parameter (only used together with
             ``post_data``).

     Returns:
         The response body returned by ``read()``.

     Note:
         There is no retry limit: every failure is logged and the request
         is issued again immediately.
     """
     body = None
     if post_data is not None:
         encoded = {}
         for k, v in post_data.iteritems():
             encoded[k] = unicode(v).encode('utf-8')
         body = urllib.urlencode(encoded)
         if append is not None:
             body += ('&channelId=' + append)
     while True:
         try:
             if body is None:
                 return opener.open(url).read()
             return opener.open(url, body).read()
         except Exception as e:
             logger.log("try to visit url:" + url + " failed !")
             logger.log(str(e), flush=True)
Exemple #7
0
    def login(self, user='', pwd=''):
        """Log in and return a cookie-carrying opener, or ``None`` on failure.

        Makes up to three attempts. Each attempt posts the credentials to
        the normal login URL; when the server answers ``alreadylogin`` the
        credentials are posted again to the forced-login URL with a fresh
        opener.

        Args:
            user: Account name sent as ``username``.
            pwd: Password sent as ``password``.

        Returns:
            The opener used for the successful request, or ``None`` when
            all three attempts fail.
        """
        # NOTE(review): this URL literal looks masked in the source; confirm
        # the real endpoint before relying on it.
        try_login = "******" + str(random.random())  # normal login attempt
        login_url = "http://vip.cnipr.com/login!goonlogin.action?rd=" + str(
            random.random())  # forced login
        post_data = {'username': user, 'password': pwd}

        def _new_browser():
            # Every request starts from an empty cookie jar.
            opener = urllib2.build_opener(
                urllib2.HTTPCookieProcessor(cookielib.CookieJar()))
            opener.addheaders = [
                ('User-agent',
                 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)')
            ]
            return opener

        for _ in range(3):
            try:
                browser = _new_browser()
                data = self.re_transmission(try_login, browser, post_data)

                if json.loads(data)['msg'] == 'alreadylogin':
                    logger.log("account already login !", flush=True)
                    browser = _new_browser()
                    data = self.re_transmission(login_url, browser, post_data)

                    if json.loads(data)['msg'] != 'success':
                        logger.log('force login failed !', flush=True)
                        continue
                    logger.log('force login success !', flush=True)
                    return browser

                if data.find('success') == -1:
                    logger.log("login failed !", flush=True)
                    # Retry instead of falling through and reporting success.
                    continue

                logger.log("login success !", flush=True)
                return browser

            except Exception as e:
                logger.log("login catch exception !", flush=True)
                logger.log(str(e), flush=True)
        return None
Exemple #8
0
    def get_data_by_expr(self, expr, opener, start_day=None, end_day=None):
        """Search for ``expr`` in a date range and request a bulk download.

        Posts an overview search first; when it yields no ``g_item``
        entries the download request is skipped.

        Args:
            expr: Search expression; a ``/PD`` date clause is appended.
            opener: Logged-in opener; ``None`` aborts immediately.
            start_day: First day of the range (rendered with ``str()``).
            end_day: Last day of the range; when equal to ``start_day`` a
                single-day clause is used.

        Returns:
            ``None`` when ``opener`` is ``None`` or the search has no
            results. NOTE(review): the visible body ends after the download
            request without returning a value; the snippet looks
            truncated, so confirm the intended return.
        """
        if opener is None:
            logger.log('login is None !')
            return None
        logger.log('login is not None !')

        begin = 1
        end = 2000
        search_url = "http://vip.cnipr.com/search!doOverviewSearch.action"
        if start_day == end_day:
            search_expr = expr + " and (" + str(start_day) + ")/PD"
        else:
            search_expr = expr + " and (" + str(start_day) + " to " + str(
                end_day) + ")/PD"
        post_data = {
            'strWhere': search_expr,
            'start': 1,
            'saveFlag': 1,
            'limit': 10,
            'channelId': 'SYXX',
            'mpage': 'advsch',
        }

        # 'FMZL' is appended to the body as an additional channelId value.
        data = self.re_transmission(search_url, opener, post_data, 'FMZL')
        soup = BeautifulSoup(data)
        target = soup.findAll('div', {'class': 'g_item'})
        length = len(target)
        logger.log("length of items:" + str(length), flush=True)
        if length < 1:
            logger.log("The search result is None,don't need to scrap !")
            return None

        download_url = "http://vip.cnipr.com/downloadvip!download2000.action?rd" + str(
            random.random())
        # NOTE(review): downlist_url and file_url are not used in the visible
        # part of this method; presumably the omitted remainder needs them.
        downlist_url = "http://vip.cnipr.com/downloadvip!downloadUserFile.action?rd=" + str(
            random.random())
        post_data = {
            'begin': begin,
            'end': end,
            'strWhere': search_expr,
            'filename': u"著录项批量下载2014401",
            'fields':
            u"申请号;名称;主分类号;分类号;申请(专利权)人;发明(设计)人;公开(公告)日;公开(公告)号;专利代理机构;代理人;申请日;地址;优先权;国省代码;摘要;主权项;国际申请;国际公布;进入国家日期;分案原申请号;权利要求书;法律状态;专利权状态代码",
            'source': 'FMZL,SYXX,WGZL',
            'strSortMethod': u"-公开(公告)日",
            "option": "2",
        }
        try:
            file_url = None
            data = self.re_transmission(download_url, opener, post_data)

            json_data = json.loads(data)
            # Raise explicitly so the check survives running with -O.
            if json_data['success'] != True:
                raise AssertionError("获取下载项失败")
        except Exception as e:
            logger.log("Get the download file catch exception !", flush=True)
            logger.log(str(e), flush=True)
Exemple #9
0
                    #logger.log("try to visit url:"+url+" success !", flush=True)
                    failed = False
                except Exception, e:
                    failed = True
                    logger.log("try to visit url:" + url + " failed !")
                    logger.log(str(e), flush=True)
                    pass
        else:
            while failed:
                try:
                    data = opener.open(url).read()
                    #logger.log("try to visit url:"+url+" success !", flush=True)
                    failed = False
                except Exception, e:
                    failed = True
                    logger.log("try to visit url:" + url + " failed !")
                    logger.log(str(e), flush=True)
                    pass
        return data

    #登陆
    def login(self, user='', pwd=''):
        try_login = "******" + str(
            random.random())  # 尝试登陆
        login_url = "http://vip.cnipr.com/login!goonlogin.action?rd=" + str(
            random.random())  # 强制登陆
        check_login = "******" + str(
            random.random())
        cnt = 3
        while (cnt > 0):
            cnt -= 1
def _scrap_record_for(item, time_stamp):
    """Return the execution record for (item, time_stamp), creating it if absent."""
    records = excute_record.objects.filter(expression=item, time_stamp=time_stamp)
    if len(records) > 0:
        return records[0]
    record = excute_record(expression=item, time_stamp=time_stamp)
    record.save()
    return record


def _scrap_patent_fields(row, record):
    """Map one spreadsheet row onto ``patent`` field values (row[19] is not stored)."""
    return {
        'record': record,                     # owning execution record
        'apply_number': row[0],               # application number
        'name': row[1],                       # title
        'main_classify_code': row[2],         # main classification code
        'classify_code': row[3],              # classification codes
        'apply_man': row[4],                  # applicant (patentee)
        'invente_man': row[5],                # inventor (designer)
        'publicity_date': row[6],             # publication (announcement) date
        'publicity_code': row[7],             # publication (announcement) number
        'patent_agent': row[8],               # patent agency
        'agent': row[9],                      # agent
        'aplly_date': row[10],                # application date
        'address': row[11],                   # address
        'priority': row[12],                  # priority
        'province_code': row[13],             # country/province code
        'abstract': row[14],                  # abstract
        'main_right': row[15],                # principal claim
        'international_apply': row[16],       # international application
        'international_publicity': row[17],   # international publication
        'enter_country_date': row[18],        # national phase entry date
        'right_demand': row[20],              # claims
        'valid_state': row[21],               # legal status
        'state_code': row[22],                # patent status code
        'type': row[23],                      # patent type
    }


def scrap(start_day=None, end_day=None, start=1, end=20):
    """Run every stored search expression and store the patents it returns.

    For each ``expression`` whose id lies in the requested range, the
    spider downloads a spreadsheet, and every row after the first either
    updates the ``patent`` with the same application number or inserts a
    new one. Rows are attached to an ``excute_record`` keyed by the
    expression and the row's publication date (column 6).

    Args:
        start_day: Passed to the logger and to the spider's download call.
        end_day: Passed to the logger and to the spider's download call.
        start: Lowest expression id to process.
        end: Highest expression id to process; ``None`` means 3000.
    """
    logger.clear()
    logger.begin(start_day, end_day, start)
    logger.log("Try to get expressions...", flush=True)
    last_id = end if end is not None else 3000
    expressions = expression.objects.filter(id__range=(start, last_id)).order_by('id')

    s = spider()
    logger.log("Try to login...", flush=True)
    browser = s.login()
    for item in expressions:
        logger.log(u"第" + str(item.id) + u"个表达式:" + item.name, count=item.id, flush=True)
        # Check that the session is still logged in before each expression.
        check_login = s.check_login(browser)
        if not json.loads(check_login)['success']:
            logger.log('check is not login , sleep 100s ,then try login again')
            time.sleep(100)
            browser = s.login()

        file_path = s.get_xls_by_expression(item.content, browser, start_day, end_day)
        if file_path is None:
            continue
        file_path = os.path.normpath(file_path)
        try:
            rows = excel_table_byindex(file_path, include_name=False)
        finally:
            # The downloaded file is removed even when parsing fails.
            os.remove(file_path)

        for index, row in enumerate(rows):
            # Skip the first row of the sheet.
            if index == 0:
                continue
            apply_num = row[0]
            # Look for an existing patent with the same application number.
            existing = patent.objects.filter(apply_number=apply_num)
            if len(existing) > 0:
                logger.log("{0} update!".format(apply_num))
                p = existing[0]
                record = _scrap_record_for(item, row[6])
                for field, value in _scrap_patent_fields(row, record).items():
                    setattr(p, field, value)
                p.save()
                continue

            logger.log(apply_num)
            record = _scrap_record_for(item, row[6])
            p = patent(**_scrap_patent_fields(row, record))
            try:
                p.save()
            except Exception as e:
                logger.log(str(e), flush=True)
                logger.log('failed to save patent!', flush=True)

    logger.log("--------Finish---------", flush=True)
    logger.finished()
Exemple #12
0
    def get_data_by_expr(self, expr, opener, start_day=None, end_day=None):
        """Search for ``expr`` in a date range and request a bulk download.

        Posts an overview search first; when it yields no ``g_item``
        entries the download request is skipped.

        Args:
            expr: Search expression; a ``/PD`` date clause is appended.
            opener: Logged-in opener; ``None`` aborts immediately.
            start_day: First day of the range (rendered with ``str()``).
            end_day: Last day of the range; when equal to ``start_day`` a
                single-day clause is used.

        Returns:
            ``None`` when ``opener`` is ``None`` or the search has no
            results. NOTE(review): the visible body ends after the download
            request without returning a value; the snippet looks
            truncated, so confirm the intended return.
        """
        if opener is None:
            logger.log('login is None !')
            return None
        logger.log('login is not None !')

        begin = 1
        end = 2000
        search_url = "http://vip.cnipr.com/search!doOverviewSearch.action"
        if start_day == end_day:
            search_expr = expr + " and (" + str(start_day) + ")/PD"
        else:
            search_expr = expr + " and (" + str(start_day) + " to " + str(
                end_day) + ")/PD"
        post_data = {
            'strWhere': search_expr,
            'start': 1,
            'saveFlag': 1,
            'limit': 10,
            'channelId': 'SYXX',
            'mpage': 'advsch',
        }

        # 'FMZL' is appended to the body as an additional channelId value.
        data = self.re_transmission(search_url, opener, post_data, 'FMZL')
        soup = BeautifulSoup(data)
        target = soup.findAll('div', {'class': 'g_item'})
        length = len(target)
        logger.log("length of items:" + str(length), flush=True)
        if length < 1:
            logger.log("The search result is None,don't need to scrap !")
            return None

        download_url = "http://vip.cnipr.com/downloadvip!download2000.action?rd" + str(
            random.random())
        # NOTE(review): downlist_url and file_url are not used in the visible
        # part of this method; presumably the omitted remainder needs them.
        downlist_url = "http://vip.cnipr.com/downloadvip!downloadUserFile.action?rd=" + str(
            random.random())
        post_data = {
            'begin': begin,
            'end': end,
            'strWhere': search_expr,
            'filename': u"著录项批量下载2014401",
            'fields':
            u"申请号;名称;主分类号;分类号;申请(专利权)人;发明(设计)人;公开(公告)日;公开(公告)号;专利代理机构;代理人;申请日;地址;优先权;国省代码;摘要;主权项;国际申请;国际公布;进入国家日期;分案原申请号;权利要求书;法律状态;专利权状态代码",
            'source': 'FMZL,SYXX,WGZL',
            'strSortMethod': u"-公开(公告)日",
            "option": "2",
        }
        try:
            file_url = None
            data = self.re_transmission(download_url, opener, post_data)

            json_data = json.loads(data)
            # Raise explicitly so the check survives running with -O.
            if json_data['success'] != True:
                raise AssertionError("获取下载项失败")
        except Exception as e:
            logger.log("Get the download file catch exception !", flush=True)
            logger.log(str(e), flush=True)
Exemple #13
0
    def login(self, user='', pwd=''):
        """Log in and return a cookie-carrying opener, or ``None`` on failure.

        Makes up to three attempts. Each attempt posts the credentials to
        the normal login URL; when the server answers ``alreadylogin`` the
        credentials are posted again to the forced-login URL with a fresh
        opener.

        Args:
            user: Account name sent as ``username``.
            pwd: Password sent as ``password``.

        Returns:
            The opener used for the successful request, or ``None`` when
            all three attempts fail.
        """
        # NOTE(review): this URL literal looks masked in the source; confirm
        # the real endpoint before relying on it.
        try_login = "******" + str(random.random())  # normal login attempt
        login_url = "http://vip.cnipr.com/login!goonlogin.action?rd=" + str(
            random.random())  # forced login
        post_data = {'username': user, 'password': pwd}

        def _new_browser():
            # Every request starts from an empty cookie jar.
            opener = urllib2.build_opener(
                urllib2.HTTPCookieProcessor(cookielib.CookieJar()))
            opener.addheaders = [
                ('User-agent',
                 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)')
            ]
            return opener

        for _ in range(3):
            try:
                browser = _new_browser()
                data = self.re_transmission(try_login, browser, post_data)

                if json.loads(data)['msg'] == 'alreadylogin':
                    logger.log("account already login !", flush=True)
                    browser = _new_browser()
                    data = self.re_transmission(login_url, browser, post_data)

                    if json.loads(data)['msg'] != 'success':
                        logger.log('force login failed !', flush=True)
                        continue
                    logger.log('force login success !', flush=True)
                    return browser

                if data.find('success') == -1:
                    logger.log("login failed !", flush=True)
                    # Retry instead of falling through and reporting success.
                    continue

                logger.log("login success !", flush=True)
                return browser

            except Exception as e:
                logger.log("login catch exception !", flush=True)
                logger.log(str(e), flush=True)
        return None
Exemple #14
0
                    #logger.log("try to visit url:"+url+" success !", flush=True)
                    failed = False
                except Exception, e:
                    failed = True
                    logger.log("try to visit url:"+url+" failed !")
                    logger.log(str(e), flush=True)
                    pass
        else:
             while failed:
                try:
                    data = opener.open(url).read()
                    #logger.log("try to visit url:"+url+" success !", flush=True)
                    failed = False
                except Exception, e:
                    failed = True
                    logger.log("try to visit url:"+url+" failed !")
                    logger.log(str(e), flush=True)
                    pass
        return data

    #登陆
    def login(self, user='', pwd=''):
        try_login = "******" + str(random.random())  # 尝试登陆
        login_url = "http://vip.cnipr.com/login!goonlogin.action?rd=" + str(random.random())  # 强制登陆
        check_login = "******" + str(random.random())
        cnt=3
        while(cnt>0):
            cnt-=1
            try:
                cj = cookielib.CookieJar()
                browser = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
def _scrap_record_for(item, time_stamp):
    """Return the execution record for (item, time_stamp), creating it if absent."""
    records = excute_record.objects.filter(expression=item, time_stamp=time_stamp)
    if len(records) > 0:
        return records[0]
    record = excute_record(expression=item, time_stamp=time_stamp)
    record.save()
    return record


def _scrap_patent_fields(row, record):
    """Map one spreadsheet row onto ``patent`` field values (row[19] is not stored)."""
    return {
        'record': record,                     # owning execution record
        'apply_number': row[0],               # application number
        'name': row[1],                       # title
        'main_classify_code': row[2],         # main classification code
        'classify_code': row[3],              # classification codes
        'apply_man': row[4],                  # applicant (patentee)
        'invente_man': row[5],                # inventor (designer)
        'publicity_date': row[6],             # publication (announcement) date
        'publicity_code': row[7],             # publication (announcement) number
        'patent_agent': row[8],               # patent agency
        'agent': row[9],                      # agent
        'aplly_date': row[10],                # application date
        'address': row[11],                   # address
        'priority': row[12],                  # priority
        'province_code': row[13],             # country/province code
        'abstract': row[14],                  # abstract
        'main_right': row[15],                # principal claim
        'international_apply': row[16],       # international application
        'international_publicity': row[17],   # international publication
        'enter_country_date': row[18],        # national phase entry date
        'right_demand': row[20],              # claims
        'valid_state': row[21],               # legal status
        'state_code': row[22],                # patent status code
        'type': row[23],                      # patent type
    }


def scrap(start_day=None, end_day=None, start=1, end=20):
    """Run every stored search expression and store the patents it returns.

    For each ``expression`` whose id lies in the requested range, the
    spider downloads a spreadsheet, and every row after the first either
    updates the ``patent`` with the same application number or inserts a
    new one. Rows are attached to an ``excute_record`` keyed by the
    expression and the row's publication date (column 6).

    Args:
        start_day: Passed to the logger and to the spider's download call.
        end_day: Passed to the logger and to the spider's download call.
        start: Lowest expression id to process.
        end: Highest expression id to process; ``None`` means 3000.
    """
    logger.clear()
    logger.begin(start_day, end_day, start)
    logger.log("Try to get expressions...", flush=True)
    last_id = end if end is not None else 3000
    expressions = expression.objects.filter(id__range=(start, last_id)).order_by('id')

    s = spider()
    logger.log("Try to login...", flush=True)
    browser = s.login()
    for item in expressions:
        logger.log(u"第" + str(item.id) + u"个表达式:" + item.name, count=item.id, flush=True)
        # Check that the session is still logged in before each expression.
        check_login = s.check_login(browser)
        if not json.loads(check_login)['success']:
            logger.log('check is not login , sleep 100s ,then try login again')
            time.sleep(100)
            browser = s.login()

        file_path = s.get_xls_by_expression(item.content, browser, start_day, end_day)
        if file_path is None:
            continue
        file_path = os.path.normpath(file_path)
        try:
            rows = excel_table_byindex(file_path, include_name=False)
        finally:
            # The downloaded file is removed even when parsing fails.
            os.remove(file_path)

        for index, row in enumerate(rows):
            # Skip the first row of the sheet.
            if index == 0:
                continue
            apply_num = row[0]
            # Look for an existing patent with the same application number.
            existing = patent.objects.filter(apply_number=apply_num)
            if len(existing) > 0:
                logger.log("{0} update!".format(apply_num))
                p = existing[0]
                record = _scrap_record_for(item, row[6])
                for field, value in _scrap_patent_fields(row, record).items():
                    setattr(p, field, value)
                p.save()
                continue

            logger.log(apply_num)
            record = _scrap_record_for(item, row[6])
            p = patent(**_scrap_patent_fields(row, record))
            try:
                p.save()
            except Exception as e:
                logger.log(str(e), flush=True)
                logger.log('failed to save patent!', flush=True)

    logger.log("--------Finish---------", flush=True)
    logger.finished()