def get_insert_sql(self):
    """Build the INSERT statement and parameters for the ``shixiseng`` table.

    All values are read from the item through ``self[...]``. Optional fields
    are parsed on a best-effort basis: if reading or parsing one of them
    fails, a fallback value (``0`` or ``None``) is stored so the row can
    still be written.

    Returns:
        tuple: ``(sql, params)`` - a parameterized INSERT statement with 19
        ``%s`` placeholders and the 19 matching values in column order.

    Raises:
        Exception: anything raised while reading the unguarded fields
        (``url``, ``url_object_id``, ``title``, ``job_city``,
        ``degree_need``, ``job_addvantage``, ``job_info``,
        ``company_name``) or while building the ``tags`` fallback
        propagates to the caller.
    """
    url = ''.join(self['url'])
    url_object_id = ''.join(self['url_object_id'])

    # A falsy title is stored as-is; otherwise it is joined and cleaned.
    title = self['title']
    if title:
        title = remove_xiexian(''.join(title))

    # Best-effort fields. ``except Exception`` (rather than a bare
    # ``except``) keeps the fallback behaviour without swallowing
    # KeyboardInterrupt / SystemExit.
    try:
        upgrade_time = get_finally(self['upgrade_time'])[10:]
    except Exception:
        upgrade_time = 0
    try:
        work_perweek = get_per_week(self['work_per_week'])
    except Exception:
        work_perweek = 0
    try:
        end_time = get_finally(self['end_time']).replace(',', '')
    except Exception:
        end_time = None
    try:
        shixi_time = get_finally(self['shixi_time']).replace('-', '') + '个月'
    except Exception:
        shixi_time = None
    try:
        # Expected form is "<min>-<max>"; a missing part raises IndexError
        # and both bounds fall back to 0.
        salary_parts = get_salary(self['salary_min']).split('-')
        salary_min = salary_parts[0] or 0
        salary_max = salary_parts[1] or 0
    except Exception:
        salary_min = 0
        salary_max = 0

    job_city = ''.join(self['job_city'])
    degree_need = ''.join(self['degree_need'])
    job_addvantage = ''.join(self['job_addvantage'])
    job_info = remove_xiexian(''.join(self['job_info']))
    company_name = ''.join(self['company_name'])

    try:
        company_url = ''.join(self['company_url'])
    except Exception:
        company_url = None
    try:
        work_address = ''.join(self['work_address'])
    except Exception:
        work_address = None

    try:
        tags = self['tags'][1] + '-' + self['tags'][2]
        # NOTE(review): need_nums reuses tags[1], the same element that
        # starts ``tags`` - confirm this is the intended source.
        need_nums = self['tags'][1]
    except Exception:
        tags = ''.join(self['tags'])
        need_nums = 0

    crawl_time = datetime.datetime.now().strftime(SQL_DATE_FORMAT)

    sql = (
        "insert into shixiseng("
        "url,url_object_id,title,upgrade_time,salary_min,salary_max,"
        "job_city,degree_need,work_perweek,shixi_time,job_addvantage,"
        "job_info,company_name,company_url,work_address,tags,need_nums,"
        "end_time,crawl_time) "
        "VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    )
    params = (
        url, url_object_id, title, upgrade_time, salary_min, salary_max,
        job_city, degree_need, work_perweek, shixi_time, job_addvantage,
        job_info, company_name, company_url, work_address, tags, need_nums,
        end_time, crawl_time,
    )
    return sql, params
def get_insert_sql(self):
    """Return the upsert statement and its parameters for the ``article`` table.

    The statement inserts one article row and, on a duplicate key, refreshes
    ``content``, ``praise_nums``, ``tags`` and ``comment_nums``.

    Returns:
        tuple: ``(insert_sql, params)`` where ``params`` holds the 11 values
        in column order. ``front_image_path`` is always sent as ``''``.
    """

    def joined(field):
        # Concatenate the fragments stored under ``field`` into one string.
        return ''.join(self[field])

    def first_as_int(field):
        # Convert the first stored value of ``field`` to int.
        return int(self[field][0])

    # SQL statement for inserting into the table.
    statement = (
        "insert into article(title,create_date,url,url_object_id,"
        "front_image_url,front_image_path,comment_nums,fav_nums, praise_nums,"
        " tags,content) "
        "VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s,%s) "
        "ON DUPLICATE KEY UPDATE content=VALUES(content),"
        "praise_nums=VALUES(praise_nums), tags=VALUES(tags),"
        "comment_nums=VALUES(comment_nums) "
    )

    # Values are evaluated top to bottom, in column order.
    values = (
        joined("title"),
        self["create_date"][0].strftime(SQL_DATE_FORMAT),
        joined("url"),
        joined("url_object_id"),
        joined("front_image_url"),
        '',  # front_image_path placeholder
        first_as_int("comment_nums"),
        first_as_int("fav_nums"),
        first_as_int("praise_nums"),
        joined("tags"),
        remove_xiexian(joined('content')),
    )
    return statement, values
def get_insert_sql(self):
    """Build the upsert statement and parameters for the ``lagou_job`` table.

    Salary and work-experience fields are expected as ``"<low>-<high>"``
    text and are split on ``-``. A salary without a hyphen no longer raises
    ``IndexError``: the upper bound falls back to the full salary text, the
    same fallback already used when a part is empty.

    Side effect:
        When ``self['salary_min']`` is truthy it is replaced on the item by
        its joined string form (kept from the previous implementation).

    Returns:
        tuple: ``(insert_sql, params)`` - the statement has 20 ``%s``
        placeholders and, on a duplicate key, updates ``job_addvantage``,
        ``job_desc`` and ``crawl_time``.

    Raises:
        Exception: errors from unguarded fields (for example ``tags``,
        ``work_years_min``, ``degree_need``, ``company_name``) propagate.
    """
    if self['salary_min']:
        self['salary_min'] = ''.join(self['salary_min'])
        salary_text = self['salary_min']
        salary_parts = salary_text.split('-')
        low = salary_parts[0]
        # Guard the missing-hyphen case instead of indexing blindly.
        high = salary_parts[1] if len(salary_parts) > 1 else ''
        salary_min = remove_xiexian(low) if low else salary_text
        salary_max = remove_xiexian(high) if high else salary_text
    else:
        salary_min = salary_max = 0

    # ``except Exception`` keeps the best-effort fallbacks without trapping
    # KeyboardInterrupt / SystemExit the way a bare ``except`` does.
    try:
        job_city = remove_xiexian(''.join(self['job_city'])) if self['job_city'] else None
    except Exception:
        job_city = None

    years_parts = ''.join(self['work_years_min']).split('-')
    work_years_min = split_years(years_parts[0]) if years_parts[0] else None
    try:
        # IndexError here means there was no upper bound in the text.
        work_years_max = remove_xiexian(years_parts[1]) if years_parts[1] else None
    except Exception:
        work_years_max = None

    tags = remove_xiexian('-'.join(self['tags']))

    # Keep only the part before the first space; if that is empty the raw
    # field value is stored unchanged.
    publish_date = ''.join(self['publish_time']).split(' ')[0]
    publish_time = publish_date if publish_date else self['publish_time']

    degree_need = remove_xiexian(''.join(self['degree_need']))
    try:
        job_desc = remove_xiexian(''.join(self['job_desc']))
    except Exception:
        job_desc = None
    job_addvantage = ''.join(self['job_addvantage'])
    company_name = remove_xiexian(''.join(self['company_name']))
    # The last element of company_area is dropped before joining.
    company_area = '-'.join(self['company_area'][:-1])
    company_url = ''.join(self['company_url']) if self['company_url'] else None
    company_scale = remove_xiexian(''.join(self['company_scale'])) if self['company_scale'] else None
    company_develop_state = remove_xiexian(''.join(self['company_develop_state']))
    crawl_time = datetime.datetime.now().strftime(SQL_DATE_FORMAT)

    insert_sql = (
        " insert into lagou_job(title,url,url_object_id,salary_min,salary_max,"
        "job_city,work_years_min,work_years_max,degree_need,work_type,tags,"
        "publish_time,job_addvantage,job_desc,company_name, company_area,"
        "company_develop_state,company_url,company_scale,crawl_time) "
        "VALUE (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) "
        "on DUPLICATE KEY UPDATE \n"
        "job_addvantage=VALUES(job_addvantage),job_desc=VALUES(job_desc),"
        "crawl_time=VALUES(crawl_time) "
    )
    params = (
        self['title'], self['url'], self['url_object_id'],
        salary_min, salary_max, job_city,
        work_years_min, work_years_max, degree_need,
        self['work_type'], tags, publish_time,
        job_addvantage, job_desc, company_name, company_area,
        company_develop_state, company_url, company_scale, crawl_time,
    )
    return insert_sql, params
def parse_detail(self, response):
    """Extract one page, save its cleaned content to disk and yield the item.

    The title is taken from ``.tit1::text`` and the content from ``.main``.
    The content is cleaned with ``remove_xiexian`` and written as UTF-8 to
    ``E:\\data\\xiaoshuo\\<title>.txt``.

    Args:
        response: the response object for the detail page; only its
            ``css(...).extract()`` API is used here.

    Yields:
        XiaoshuoItem: the item carrying the extracted ``title`` and
        ``content`` lists.

    Raises:
        OSError: if the target directory does not exist or the file cannot
            be written.
    """
    import os

    xiaoshuo = XiaoshuoItem()
    xiaoshuo['title'] = response.css('.tit1::text').extract()
    xiaoshuo['content'] = response.css('.main').extract()

    # Build the file name from the extracted text itself; str() of the list
    # would produce names such as "['title'].txt".
    file_name = ''.join(xiaoshuo['title']).strip() + '.txt'
    # Write to an explicit path instead of os.chdir(), which would change
    # the working directory of the whole process on every callback.
    file_path = os.path.join(r'E:\data\xiaoshuo', file_name)

    content = remove_xiexian(''.join(xiaoshuo['content']))
    with open(file_path, 'wb') as f:
        f.write(content.encode('utf-8'))
    yield xiaoshuo
def get_insert_sql(self):
    """Build the upsert statement and parameters for the ``movie`` table.

    ``url``, ``url_object_id`` and ``movie_url`` are required. Every other
    field is best-effort: if reading or cleaning it fails, a fallback value
    is stored (``''`` for text fields, ``0`` for ``score`` and ``info``).

    The ON DUPLICATE KEY UPDATE clause uses ``VALUES(col)`` and separates
    its assignments with a comma; the earlier ``VALUE(info) score=...`` form
    was not valid SQL.

    Returns:
        tuple: ``(insert_sql, params)`` with 9 ``%s`` placeholders and the 9
        matching values in column order.

    Raises:
        Exception: errors raised while reading ``url``, ``url_object_id``
        or ``movie_url`` propagate to the caller.
    """
    url = ''.join(self['url'])
    url_object_id = self['url_object_id']

    # ``except Exception`` keeps the original best-effort fallbacks without
    # swallowing KeyboardInterrupt / SystemExit like a bare ``except``.
    try:
        main_title = remove_xiexian(''.join(self['main_title']))
    except Exception:
        main_title = ''
    try:
        title = main_title + remove_xiexian(''.join(self['title']))
    except Exception:
        title = ''
    try:
        tags = remove_xiexian(''.join(self['tags']))
    except Exception:
        tags = ''
    try:
        # The score is stored in two fragments that are concatenated.
        score = remove_xiexian(''.join(self['score1']) + ''.join(self['score2']))
    except Exception:
        score = 0
    try:
        info = remove_xiexian(''.join(self['info']))
    except Exception:
        info = 0
    try:
        role = '-'.join(self['role'])
    except Exception:
        role = ''
    try:
        image_url = '--'.join(self['image_url'])
    except Exception:
        image_url = ''

    movie_url = self['movie_url']

    insert_sql = (
        "insert into movie(url,url_object_id,title,tags,score,info,role,"
        "image_url,movie_url) "
        "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s) "
        "on duplicate KEY UPDATE info=VALUES(info), score=VALUES(score)"
    )
    params = (url, url_object_id, title, tags, score, info, role, image_url, movie_url)
    return insert_sql, params