def import_from_xls(file):
    """Create and save one ``patent`` for every row of a spreadsheet.

    Each row returned by ``excel_table_byindex(file)`` is walked key by key.
    Every non-empty value is stored in a 100-slot list at the position that
    the module-level ``dic`` mapping gives for its key, and the slots are then
    handed to the ``patent`` constructor by fixed index.  Slots that were
    never filled keep the placeholder ``0``.

    Args:
        file: Passed unchanged to ``excel_table_byindex`` (presumably the
            path of an .xls export -- confirm against that helper).

    Raises:
        KeyError: If a row contains a key that is missing from ``dic``.
        TypeError: If a value has no length (``len()`` is applied to every
            value).
    """
    rows = excel_table_byindex(file)
    for row in rows:
        # Fixed-size slot list; ``dic`` decides which slot a column fills.
        patent_data = [0] * 100
        for key in row:
            value = row[key]
            # Single-argument print() emits the same "key value" text on
            # Python 2 and Python 3.
            print("%s %s" % (key, value))
            if len(value) > 0:
                patent_data[dic[key]] = value

        # Slot 1 (the execution record) and slot 21 are intentionally not
        # passed to the model here.
        temp_patent = patent(
            # application number
            apply_number=patent_data[2],
            # name
            name=patent_data[3],
            # main classification code
            main_classify_code=patent_data[4],
            # classification code
            classify_code=patent_data[5],
            # applicant (patentee)
            apply_man=patent_data[6],
            # inventor (designer)
            invente_man=patent_data[7],
            # publication (announcement) date
            publicity_date=patent_data[8],
            # publication (announcement) number
            publicity_code=patent_data[9],
            # patent agency
            patent_agent=patent_data[10],
            # agent
            agent=patent_data[11],
            # application date
            aplly_date=patent_data[12],
            # address
            address=patent_data[13],
            # priority
            priority=patent_data[14],
            # country / province code
            province_code=patent_data[15],
            # abstract
            abstract=patent_data[16],
            # main claim
            main_right=patent_data[17],
            # international application
            international_apply=patent_data[18],
            # international publication
            international_publicity=patent_data[19],
            # national-phase entry date
            enter_country_date=patent_data[20],
            right_demand=patent_data[22],
            valid_state=patent_data[23],
            state_code=patent_data[24])
        temp_patent.save()
def import_from_xls(file):
    """Create and save one ``patent`` for every row of a spreadsheet.

    Each row returned by ``excel_table_byindex(file)`` is walked key by key.
    Every non-empty value is stored in a 100-slot list at the position that
    the module-level ``dic`` mapping gives for its key, and the slots are then
    handed to the ``patent`` constructor by fixed index.  Slots that were
    never filled keep the placeholder ``0``.

    Args:
        file: Passed unchanged to ``excel_table_byindex`` (presumably the
            path of an .xls export -- confirm against that helper).

    Raises:
        KeyError: If a row contains a key that is missing from ``dic``.
        TypeError: If a value has no length (``len()`` is applied to every
            value).
    """
    rows = excel_table_byindex(file)
    for row in rows:
        # Fixed-size slot list; ``dic`` decides which slot a column fills.
        patent_data = [0] * 100
        for key in row:
            value = row[key]
            # Single-argument print() emits the same "key value" text on
            # Python 2 and Python 3.
            print("%s %s" % (key, value))
            if len(value) > 0:
                patent_data[dic[key]] = value

        # Slot 1 (the execution record) and slot 21 are intentionally not
        # passed to the model here.
        temp_patent = patent(
            # application number
            apply_number=patent_data[2],
            # name
            name=patent_data[3],
            # main classification code
            main_classify_code=patent_data[4],
            # classification code
            classify_code=patent_data[5],
            # applicant (patentee)
            apply_man=patent_data[6],
            # inventor (designer)
            invente_man=patent_data[7],
            # publication (announcement) date
            publicity_date=patent_data[8],
            # publication (announcement) number
            publicity_code=patent_data[9],
            # patent agency
            patent_agent=patent_data[10],
            # agent
            agent=patent_data[11],
            # application date
            aplly_date=patent_data[12],
            # address
            address=patent_data[13],
            # priority
            priority=patent_data[14],
            # country / province code
            province_code=patent_data[15],
            # abstract
            abstract=patent_data[16],
            # main claim
            main_right=patent_data[17],
            # international application
            international_apply=patent_data[18],
            # international publication
            international_publicity=patent_data[19],
            # national-phase entry date
            enter_country_date=patent_data[20],
            right_demand=patent_data[22],
            valid_state=patent_data[23],
            state_code=patent_data[24])
        temp_patent.save()
def _get_or_create_record(item, time_stamp):
    """Return the ``excute_record`` for (item, time_stamp), saving a new one if none exists."""
    matches = excute_record.objects.filter(expression=item, time_stamp=time_stamp)
    if len(matches) > 0:
        return matches[0]
    record = excute_record(expression=item, time_stamp=time_stamp)
    record.save()
    return record


def _patent_fields(row):
    """Map one exported spreadsheet row (indexed by column) onto ``patent`` field values."""
    # Column 19 of the export is not mapped to any field.
    return {
        'apply_number': row[0],              # application number
        'name': row[1],                      # name
        'main_classify_code': row[2],        # main classification code
        'classify_code': row[3],             # classification code
        'apply_man': row[4],                 # applicant (patentee)
        'invente_man': row[5],               # inventor (designer)
        'publicity_date': row[6],            # publication (announcement) date
        'publicity_code': row[7],            # publication (announcement) number
        'patent_agent': row[8],              # patent agency
        'agent': row[9],                     # agent
        'aplly_date': row[10],               # application date
        'address': row[11],                  # address
        'priority': row[12],                 # priority
        'province_code': row[13],            # country / province code
        'abstract': row[14],                 # abstract
        'main_right': row[15],               # main claim
        'international_apply': row[16],      # international application
        'international_publicity': row[17],  # international publication
        'enter_country_date': row[18],       # national-phase entry date
        'right_demand': row[20],             # claims
        'valid_state': row[21],              # legal status
        'state_code': row[22],               # patent status code
        'type': row[23],                     # patent type
    }


def scrap(start_day=None, end_day=None, start=1, end=20):
    """Download the export for each stored expression and upsert its patents.

    For every ``expression`` whose id lies in the requested range the spider
    fetches an .xls export, the file is parsed and deleted, and each data row
    is written to the ``patent`` table: an existing patent with the same
    application number is updated, otherwise a new one is inserted.  Every
    patent is linked to the ``excute_record`` identified by the expression and
    the row's publication date (column 6).

    Args:
        start_day: Forwarded to ``logger.begin`` and
            ``spider.get_xls_by_expression``.
        end_day: Forwarded to ``logger.begin`` and
            ``spider.get_xls_by_expression``.
        start: Lower bound of the ``id__range`` lookup on expressions.
        end: Upper bound of the ``id__range`` lookup; ``None`` means 3000.
    """
    logger.clear()
    logger.begin(start_day, end_day, start)
    logger.log("Try to get expressions...", flush=True)
    last_id = end if end is not None else 3000
    expressions = expression.objects.filter(id__range=(start, last_id)).order_by('id')

    s = spider()
    logger.log("Try to login...", flush=True)
    browser = s.login()

    for item in expressions:
        logger.log(u"第" + str(item.id) + u"个表达式:" + item.name, count=item.id, flush=True)

        # Verify the session is still logged in; back off and log in again if not.
        check_login = s.check_login(browser)
        if not json.loads(check_login)['success']:
            logger.log('check is not login , sleep 100s ,then try login again')
            time.sleep(100)
            browser = s.login()

        file_path = s.get_xls_by_expression(item.content, browser, start_day, end_day)
        if file_path is None:
            continue
        file_path = os.path.normpath(file_path)
        rows = excel_table_byindex(file_path, include_name=False)
        # The download is only needed for parsing; delete it afterwards.
        os.remove(file_path)

        for row in rows:
            # Skip the header (any row equal to the first row).
            if row == rows[0]:
                continue
            apply_num = row[0]

            # Duplicate check on the application number.
            existing = patent.objects.filter(apply_number=apply_num)
            is_update = len(existing) > 0
            if is_update:
                logger.log("{0} update!".format(apply_num))
            else:
                logger.log(apply_num)

            # row[6] is the publication date; it doubles as the record's time stamp.
            record = _get_or_create_record(item, row[6])
            fields = _patent_fields(row)

            if is_update:
                p = existing[0]
                p.record = record
                for field_name, value in fields.items():
                    setattr(p, field_name, value)
            else:
                p = patent(record=record, **fields)

            # Both paths are guarded so one bad row cannot abort the whole run.
            try:
                p.save()
            except Exception as e:
                logger.log(str(e), flush=True)
                logger.log('failed to save patent!', flush=True)
def _get_or_create_record(item, time_stamp):
    """Return the ``excute_record`` for (item, time_stamp), saving a new one if none exists."""
    matches = excute_record.objects.filter(expression=item, time_stamp=time_stamp)
    if len(matches) > 0:
        return matches[0]
    record = excute_record(expression=item, time_stamp=time_stamp)
    record.save()
    return record


def _patent_fields(row):
    """Map one exported spreadsheet row (indexed by column) onto ``patent`` field values."""
    # Column 19 of the export is not mapped to any field.
    return {
        'apply_number': row[0],              # application number
        'name': row[1],                      # name
        'main_classify_code': row[2],        # main classification code
        'classify_code': row[3],             # classification code
        'apply_man': row[4],                 # applicant (patentee)
        'invente_man': row[5],               # inventor (designer)
        'publicity_date': row[6],            # publication (announcement) date
        'publicity_code': row[7],            # publication (announcement) number
        'patent_agent': row[8],              # patent agency
        'agent': row[9],                     # agent
        'aplly_date': row[10],               # application date
        'address': row[11],                  # address
        'priority': row[12],                 # priority
        'province_code': row[13],            # country / province code
        'abstract': row[14],                 # abstract
        'main_right': row[15],               # main claim
        'international_apply': row[16],      # international application
        'international_publicity': row[17],  # international publication
        'enter_country_date': row[18],       # national-phase entry date
        'right_demand': row[20],             # claims
        'valid_state': row[21],              # legal status
        'state_code': row[22],               # patent status code
        'type': row[23],                     # patent type
    }


def begin(file_path='./test.xls', item=None):
    """Upsert the patents listed in a local spreadsheet export.

    Every data row of ``file_path`` is written to the ``patent`` table: an
    existing patent with the same application number is updated, otherwise a
    new one is inserted.  Each patent is linked to the ``excute_record``
    identified by ``item`` and the row's publication date (column 6).

    Args:
        file_path: Spreadsheet to load; defaults to ``'./test.xls'``.
        item: Expression the rows are attributed to.  The original body read
            ``item`` without binding it, so when omitted a module-level
            ``item`` is used if one exists.

    Raises:
        NameError: If there are data rows to process but no ``item`` was
            passed and no module-level ``item`` is defined.
    """
    if item is None:
        # Preserve the legacy lookup of a module-level ``item``.
        item = globals().get('item')

    rows = excel_table_byindex(file_path, include_name=False)
    for row in rows:
        # Skip the header (any row equal to the first row).
        if row == rows[0]:
            continue
        if item is None:
            raise NameError(
                "begin() needs an expression: pass item=... "
                "(no module-level 'item' is defined)")
        apply_num = row[0]

        # Duplicate check on the application number.
        existing = patent.objects.filter(apply_number=apply_num)
        is_update = len(existing) > 0
        if is_update:
            print("{0} update!".format(apply_num))
        else:
            logger.log(apply_num)

        # row[6] is the publication date; it doubles as the record's time stamp.
        record = _get_or_create_record(item, row[6])
        fields = _patent_fields(row)

        if is_update:
            p = existing[0]
            p.record = record
            for field_name, value in fields.items():
                setattr(p, field_name, value)
        else:
            p = patent(record=record, **fields)

        # Both paths are guarded so one bad row cannot abort the whole run.
        try:
            p.save()
        except Exception as e:
            logger.log(str(e), flush=True)
            logger.log('failed to save patent!', flush=True)
def _get_or_create_record(item, time_stamp):
    """Return the ``excute_record`` for (item, time_stamp), saving a new one if none exists."""
    matches = excute_record.objects.filter(expression=item, time_stamp=time_stamp)
    if len(matches) > 0:
        return matches[0]
    record = excute_record(expression=item, time_stamp=time_stamp)
    record.save()
    return record


def _patent_fields(row):
    """Map one exported spreadsheet row (indexed by column) onto ``patent`` field values."""
    # Column 19 of the export is not mapped to any field.
    return {
        'apply_number': row[0],              # application number
        'name': row[1],                      # name
        'main_classify_code': row[2],        # main classification code
        'classify_code': row[3],             # classification code
        'apply_man': row[4],                 # applicant (patentee)
        'invente_man': row[5],               # inventor (designer)
        'publicity_date': row[6],            # publication (announcement) date
        'publicity_code': row[7],            # publication (announcement) number
        'patent_agent': row[8],              # patent agency
        'agent': row[9],                     # agent
        'aplly_date': row[10],               # application date
        'address': row[11],                  # address
        'priority': row[12],                 # priority
        'province_code': row[13],            # country / province code
        'abstract': row[14],                 # abstract
        'main_right': row[15],               # main claim
        'international_apply': row[16],      # international application
        'international_publicity': row[17],  # international publication
        'enter_country_date': row[18],       # national-phase entry date
        'right_demand': row[20],             # claims
        'valid_state': row[21],              # legal status
        'state_code': row[22],               # patent status code
        'type': row[23],                     # patent type
    }


def scrap(start_day=None, end_day=None, start=1, end=20):
    """Download the export for each stored expression and upsert its patents.

    For every ``expression`` whose id lies in the requested range the spider
    fetches an .xls export, the file is parsed and deleted, and each data row
    is written to the ``patent`` table: an existing patent with the same
    application number is updated, otherwise a new one is inserted.  Every
    patent is linked to the ``excute_record`` identified by the expression and
    the row's publication date (column 6).

    Args:
        start_day: Forwarded to ``logger.begin`` and
            ``spider.get_xls_by_expression``.
        end_day: Forwarded to ``logger.begin`` and
            ``spider.get_xls_by_expression``.
        start: Lower bound of the ``id__range`` lookup on expressions.
        end: Upper bound of the ``id__range`` lookup; ``None`` means 3000.
    """
    logger.clear()
    logger.begin(start_day, end_day, start)
    logger.log("Try to get expressions...", flush=True)
    last_id = end if end is not None else 3000
    expressions = expression.objects.filter(id__range=(start, last_id)).order_by('id')

    s = spider()
    logger.log("Try to login...", flush=True)
    browser = s.login()

    for item in expressions:
        logger.log(u"第" + str(item.id) + u"个表达式:" + item.name, count=item.id, flush=True)

        # Verify the session is still logged in; back off and log in again if not.
        check_login = s.check_login(browser)
        if not json.loads(check_login)['success']:
            logger.log('check is not login , sleep 100s ,then try login again')
            time.sleep(100)
            browser = s.login()

        file_path = s.get_xls_by_expression(item.content, browser, start_day, end_day)
        if file_path is None:
            continue
        file_path = os.path.normpath(file_path)
        rows = excel_table_byindex(file_path, include_name=False)
        # The download is only needed for parsing; delete it afterwards.
        os.remove(file_path)

        for row in rows:
            # Skip the header (any row equal to the first row).
            if row == rows[0]:
                continue
            apply_num = row[0]

            # Duplicate check on the application number.
            existing = patent.objects.filter(apply_number=apply_num)
            is_update = len(existing) > 0
            if is_update:
                logger.log("{0} update!".format(apply_num))
            else:
                logger.log(apply_num)

            # row[6] is the publication date; it doubles as the record's time stamp.
            record = _get_or_create_record(item, row[6])
            fields = _patent_fields(row)

            if is_update:
                p = existing[0]
                p.record = record
                for field_name, value in fields.items():
                    setattr(p, field_name, value)
            else:
                p = patent(record=record, **fields)

            # Both paths are guarded so one bad row cannot abort the whole run.
            try:
                p.save()
            except Exception as e:
                logger.log(str(e), flush=True)
                logger.log('failed to save patent!', flush=True)