def remove_html_tag():
    """Overwrite the ``content`` of selected estates with its extracted text.

    Binds ``db`` to MySQL using ``settings.get('DB')``, generates the entity
    mapping, then walks every ``EstateEntity`` whose ``status`` equals 1.
    For each one, ``bs(estate.content).text`` is computed and stored back
    into ``estate.content``:

    * extraction raised an exception -> ``'[extract_error]'``
    * extraction yielded a falsy (empty) result -> ``'[no_cleaned_text]'``
    * otherwise -> the extracted text

    ``commit()`` is called after every estate, so rows already processed
    stay saved if a later one fails. Progress is reported with ``print``.
    """
    db.bind('mysql', **settings.get('DB'))
    db.generate_mapping()
    print('connect to db!')

    with db_session:
        print('select items!')

        estates = EstateEntity.select(lambda e: e.status == 1)
        for estate in estates:

            print('===> process: ', estate.url)

            try:
                # ``bs`` is presumably a BeautifulSoup alias -- TODO confirm.
                extract_content = bs(estate.content).text
            except Exception:
                # Deliberately broad: any parsing failure marks the row and
                # moves on. ``Exception`` (not a bare ``except``) so that
                # KeyboardInterrupt / SystemExit still stop the run instead
                # of being recorded as an extraction error.
                estate.content = '[extract_error]'
                commit()
                print('extract_error')
                continue

            if extract_content:
                estate.content = extract_content
                outcome = 'done'
            else:
                # Marker distinguishes "parsed, but nothing left" from the
                # extraction-failure marker above.
                estate.content = '[no_cleaned_text]'
                outcome = 'no_cleaned_text'

            commit()
            print(outcome)
def filter_seg_freq():
    """Decode the ``seg_freq`` JSON of every estate (unfinished stub).

    Binds ``db`` to MySQL using ``settings.get('DB')``, generates the entity
    mapping, then iterates all ``EstateEntity`` rows ordered by ``id`` and
    parses each row's ``seg_freq`` attribute with ``json.loads``. The parsed
    value is not used yet; the per-row processing is still a ``@todo``.
    Finally prints ``total_seg_freq`` and ``'done'``.
    """
    db.bind('mysql', **settings.get('DB'))
    db.generate_mapping()
    print('connect to db!')

    with db_session:
        print('select items!')
        for record in EstateEntity.select().order_by(EstateEntity.id):
            # Parsed but not consumed yet -- see the todo below.
            seg_freq = json.loads(record.seg_freq)

            # @todo:

    # NOTE(review): ``total_seg_freq`` is never assigned inside this function.
    # Unless it is a module-level name defined elsewhere, this line raises
    # NameError -- confirm once the todo above is implemented.
    print('total_seg_freq: ', total_seg_freq)
    print('done')
def export_all_content():
    """Dump every estate to the CSV file at ``csvfile_path``.

    Binds ``db`` to MySQL using ``settings.get('DB')``, generates the entity
    mapping, opens ``csvfile_path`` for writing and emits a header row
    followed by one row per ``EstateEntity`` (ordered by ``id``) with the
    columns ``url``, ``website``, ``published_at`` and the UTF-8 encoded
    ``content``. Progress is reported with ``print``.
    """
    db.bind('mysql', **settings.get('DB'))
    db.generate_mapping()
    print('connect to db!')

    # NOTE(review): binary mode plus the explicit ``.encode('utf-8')`` below
    # looks like the Python 2 way of driving ``csv.writer`` -- confirm the
    # target interpreter before changing either.
    with open(csvfile_path, 'wb') as out:
        print('open csvfile!')
        sheet = csv.writer(out)

        print('write table head')
        sheet.writerow(['url', 'website', 'published_at', 'content'])

        with db_session:
            print('select items!')
            ordered = EstateEntity.select().order_by(EstateEntity.id)
            for record in ordered:
                sheet.writerow([
                    record.url,
                    record.website,
                    record.published_at,
                    record.content.encode('utf-8'),
                ])

                print('id: ', record.id)
# Example #4
# 0
    def process_item(self, item, spider):
        """Store ``item`` as a new ``EstateEntity`` unless its URL already exists.

        Looks up an ``EstateEntity`` by ``item['url']`` inside a
        ``db_session``. If one is found, a message is printed and nothing is
        created. Otherwise a new entity is built from the item's ``url``,
        ``published_at``, ``website``, ``location`` and ``html`` keys.
        ``spider`` is not used.

        NOTE(review): both paths visible here return ``None``. If this is a
        Scrapy item pipeline, ``process_item`` is normally expected to return
        the item (or raise ``DropItem``) -- confirm against the caller.
        """
        url = item['url']

        with db_session:
            existing = EstateEntity.get(url=url)
            if existing:
                print('already have this url item')
                return

            # Remaining columns are copied from the item under the same names.
            copied_fields = ('published_at', 'website', 'location', 'html')
            EstateEntity(url=url, **{name: item[name] for name in copied_fields})

            print('url: ', url)
            print('save post')