Example #1
    def get_insert_sql(self):
        url=''.join(self['url'])
        url_object_id=''.join(self['url_object_id'])
        if self['title']:
            title=remove_xiexian(''.join(self['title']))
        else:
            title=self['title']
        try:
            upgrade_time=get_finally(self['upgrade_time'])[10:]
        except:
            upgrade_time=0
        try:
            work_perweek=get_per_week(self['work_per_week'])
        except:
            work_perweek=0
        try:
            end_time=get_finally(self['end_time']).replace(',','')
        except:
            end_time=None
        try:
            shixi_time=get_finally(self['shixi_time']).replace('-','')+'个月'
        except:
            shixi_time=None
        try:
            salary=get_salary(self['salary_min'])
            salary_min=salary.split('-')[0] if salary.split('-')[0] else 0
            salary_max=salary.split('-')[1] if salary.split('-')[1] else 0
        except:
            salary_min=0
            salary_max=0
        try:
            company_url=''.join(self['company_url'])
        except:
            company_url=None
        job_city=''.join(self['job_city'])
        degree_need=''.join(self['degree_need'])
        job_addvantage=''.join(self['job_addvantage'])
        job_info=remove_xiexian(''.join(self['job_info']))
        company_name=''.join(self['company_name'])
        try:
            work_address=''.join(self['work_address'])
        except:
            work_address=None
        try:
            tags=self['tags'][1]+'-'+self['tags'][2]
            need_nums=self['tags'][1]
        except:
            tags=''.join(self['tags'])
            need_nums=0
        crawl_time=datetime.datetime.now().strftime(SQL_DATE_FORMAT)

        sql="""insert into shixiseng(url,url_object_id,title,upgrade_time,salary_min,salary_max,job_city,degree_need,work_perweek,shixi_time,job_addvantage,job_info,company_name,company_url,work_address,tags,need_nums,end_time,crawl_time) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
        params=(url,url_object_id,title,upgrade_time,salary_min,salary_max,job_city,degree_need,work_perweek,shixi_time,job_addvantage,job_info,company_name,company_url,work_address,tags,need_nums,end_time,crawl_time)

        return sql,params
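
The get_insert_sql() methods in these examples all follow the same contract: they return an (sql, params) pair that a database pipeline executes later. A minimal sketch of such a pipeline using Twisted's adbapi connection pool is shown below; the settings keys, class name, and driver choice are assumptions for illustration, not part of the examples.

import pymysql
import pymysql.cursors
from twisted.enterprise import adbapi


class MysqlTwistedPipeline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        # build an asynchronous connection pool from project settings (assumed keys)
        dbpool = adbapi.ConnectionPool(
            "pymysql",
            host=settings["MYSQL_HOST"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            password=settings["MYSQL_PASSWORD"],
            charset="utf8mb4",
            cursorclass=pymysql.cursors.DictCursor,
        )
        return cls(dbpool)

    def process_item(self, item, spider):
        # run the insert asynchronously; log failures instead of blocking the spider
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error, item, spider)
        return item

    def do_insert(self, cursor, item):
        # delegate SQL generation to the item, as in the examples on this page
        insert_sql, params = item.get_insert_sql()
        cursor.execute(insert_sql, params)

    def handle_error(self, failure, item, spider):
        spider.logger.error(failure)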
Example #2
    def get_insert_sql(self):
        # SQL statement for inserting a row into the article table
        insert_sql = """insert into article(title,create_date,url,url_object_id,front_image_url,front_image_path,comment_nums,fav_nums, praise_nums, tags,content) VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s,%s)
              ON DUPLICATE KEY UPDATE content=VALUES(content),praise_nums=VALUES(praise_nums),
              tags=VALUES(tags),comment_nums=VALUES(comment_nums)
        """
        title=''.join(self["title"])
        create_date=self["create_date"][0].strftime(SQL_DATE_FORMAT)
        url=''.join(self["url"])
        url_object_id=''.join(self["url_object_id"])
        front_image_url=''.join(self["front_image_url"])
        comment_nums=int(self["comment_nums"][0])
        fav_nums=int(self["fav_nums"][0])
        praise_nums=int(self["praise_nums"][0])
        tags=''.join(self["tags"])
        content=remove_xiexian(''.join(self['content']))
        # front_image_path=self["front_image_path"] if self["front_image_path"] else ''
        params = (
            title, create_date,url,
            url_object_id, front_image_url,'',
            comment_nums, fav_nums,
            praise_nums,tags,content
        )

        return insert_sql, params
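
The helpers remove_xiexian() and SQL_DATE_FORMAT are defined elsewhere in the project and are not shown here. A minimal sketch of what they presumably look like follows; the implementation is an assumption (斜线 means "slash", so the helper most likely strips backslashes and stray whitespace before the text is stored).

import re

SQL_DATE_FORMAT = "%Y-%m-%d %H:%M:%S"  # assumed datetime format for the MySQL columns

def remove_xiexian(value):
    # hypothetical implementation: drop backslashes ("斜线") and collapse runs of whitespace
    return re.sub(r"\s+", " ", value.replace("\\", "")).strip()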
Example #3
    def get_insert_sql(self):
        if self['salary_min']:
            self['salary_min']=''.join(self['salary_min'])
            # split once and guard against values without a '-' separator
            salary=self['salary_min'].split('-')
            salary_min=remove_xiexian(salary[0]) if salary[0] else self['salary_min']
            salary_max=remove_xiexian(salary[1]) if len(salary)>1 and salary[1] else self['salary_min']
        else:
            salary_min=salary_max=0
        try:
            job_city=remove_xiexian(''.join(self['job_city'])) if self['job_city'] else None
        except:
            job_city=None
        work_years_min=split_years(''.join(self['work_years_min']).split('-')[0]) if ''.join(self['work_years_min']).split('-')[0] else None
        try:
            work_years_max=remove_xiexian(''.join(self['work_years_min']).split('-')[1]) if ''.join(self['work_years_min']).split('-')[1] else None
        except:
            work_years_max=None
        tags=remove_xiexian('-'.join(self['tags']))
        publish_time=''.join(self['publish_time']).split(' ')[0] if ''.join(self['publish_time']).split(' ')[0] else self['publish_time']
        degree_need=remove_xiexian(''.join(self['degree_need']))
        try:
            job_desc=remove_xiexian(''.join(self['job_desc']))
        except:
            job_desc=None
        job_addvantage=''.join(self['job_addvantage'])
        company_name=remove_xiexian(''.join(self['company_name']))
        company_area='-'.join(self['company_area'][:-1])
        company_url=''.join(self['company_url']) if self['company_url'] else None
        company_scale=remove_xiexian(''.join(self['company_scale'])) if self['company_scale'] else None
        company_develop_state=remove_xiexian(''.join(self['company_develop_state']))
        crawl_time=datetime.datetime.now().strftime(SQL_DATE_FORMAT)
        insert_sql="""
                insert into lagou_job(title,url,url_object_id,salary_min,salary_max,job_city,work_years_min,work_years_max,degree_need,work_type,tags,publish_time,job_addvantage,job_desc,company_name,
                company_area,company_develop_state,company_url,company_scale,crawl_time) VALUE (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
               on DUPLICATE KEY UPDATE job_addvantage=VALUES(job_addvantage),job_desc=VALUES(job_desc),crawl_time=VALUES(crawl_time)
                """
        params=(
            self['title'],self['url'],self['url_object_id'],salary_min,salary_max,job_city,work_years_min,work_years_max,degree_need,
            self['work_type'],tags,publish_time,job_addvantage,job_desc,company_name,company_area,
            company_develop_state,company_url,company_scale,crawl_time
        )


        return insert_sql,params
Example #4
    def parse_detail(self, response):
        import os
        # write the chapters into a fixed output directory
        os.chdir(r'E:\data\xiaoshuo')
        xiaoshuo = XiaoshuoItem()
        xiaoshuo['title'] = response.css('.tit1::text').extract()
        xiaoshuo['content'] = response.css('.main').extract()
        # join the extracted list into a plain string before using it as a file name
        title = ''.join(xiaoshuo['title']) + '.txt'
        content = remove_xiexian(''.join(xiaoshuo['content']))
        with open(title, 'wb') as f:
            f.write(content.encode('utf-8'))
        yield xiaoshuo
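
Note that os.chdir() changes the working directory for the whole process, which affects every other spider and pipeline running in it. A variant sketch that builds an explicit path instead (the directory is the one hard-coded above; the helper name is made up for illustration):

import os

OUT_DIR = r'E:\data\xiaoshuo'

def save_chapter(title, content):
    # write one chapter under OUT_DIR without touching the process working directory
    os.makedirs(OUT_DIR, exist_ok=True)
    path = os.path.join(OUT_DIR, title + '.txt')
    with open(path, 'w', encoding='utf-8') as f:
        f.write(content)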
Example #5
    def get_insert_sql(self):
        url=''.join(self['url'])
        url_object_id=self['url_object_id']
        try:
            main_title=remove_xiexian(''.join(self['main_title']))
        except:
            main_title=''
        try:
            title=main_title+remove_xiexian(''.join(self['title']))
        except:
            title=''
        try:
            tags=remove_xiexian(''.join(self['tags']))
        except:
            tags=''
        try:
            score=remove_xiexian(''.join(self['score1'])+''.join(self['score2']))
        except:
            score=0
        try:
            info=remove_xiexian(''.join(self['info']))
        except:
            info=0
        try:
            role='-'.join(self['role'])
        except:
            role=''
        try:
            image_url='--'.join(self['image_url'])
        except:
            image_url=''
        movie_url=self['movie_url']

        insert_sql="""insert into movie(url,url_object_id,title,tags,score,info,role,image_url,movie_url) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)
                   on duplicate KEY UPDATE info=VALUE(info) score=VALUE(score)"""
        params=(url,url_object_id,title,tags,score,info,role,image_url,movie_url)

        return insert_sql,params
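
ON DUPLICATE KEY UPDATE only takes effect when the inserted row collides with a PRIMARY or UNIQUE key, so the movie table needs such a key, presumably on url_object_id. The table definition is not part of the example; a possible layout, stated as an assumption, kept as a Python string constant in the style of the snippets above:

# Assumed schema for the movie table; only the PRIMARY KEY matters for the
# ON DUPLICATE KEY UPDATE clause in the insert statement above.
CREATE_MOVIE_TABLE = """
CREATE TABLE IF NOT EXISTS movie (
    url           VARCHAR(300),
    url_object_id VARCHAR(50) NOT NULL,
    title         VARCHAR(255),
    tags          VARCHAR(255),
    score         VARCHAR(20),
    info          TEXT,
    role          VARCHAR(255),
    image_url     VARCHAR(1000),
    movie_url     VARCHAR(500),
    PRIMARY KEY (url_object_id)
) DEFAULT CHARSET=utf8mb4
"""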