def remove_html_tag():
    """Replace the ``content`` of every ``EstateEntity`` with ``status == 1`` by its plain text.

    Each row's ``content`` is parsed with ``bs`` and the resulting ``.text``
    is written back.  Rows that fail to parse are marked with the sentinel
    ``'[extract_error]'``; rows whose extracted text is empty are marked
    with ``'[no_cleaned_text]'``.  Every row is committed individually so
    progress is kept if the run stops part-way.
    """
    db.bind('mysql', **settings.get('DB'))
    db.generate_mapping()
    print('connect to db!')

    with db_session:
        print('select items!')

        estates = EstateEntity.select(lambda e: e.status == 1)
        for estate in estates:

            print('===> process: ', estate.url)

            try:
                cleaned_text = bs(estate.content).text
            except Exception:
                # Catch Exception rather than using a bare ``except`` so that
                # KeyboardInterrupt / SystemExit can still stop the batch.
                new_content, outcome = '[extract_error]', 'extract_error'
            else:
                if cleaned_text:
                    new_content, outcome = cleaned_text, 'done'
                else:
                    new_content, outcome = '[no_cleaned_text]', 'no_cleaned_text'

            # Single write/commit/report path shared by all three outcomes.
            estate.content = new_content
            commit()
            print(outcome)
def extract_content_with_readability():
    """Retry content extraction for rows previously marked ``'[no_cleaned_text]'``.

    For every ``EstateQ2Entity`` whose ``content`` equals the
    ``'[no_cleaned_text]'`` sentinel, ``Document(estate.html).summary()`` is
    used to produce a new ``content`` value.  A failing extraction stores
    ``'[extract_error]'``; an empty result leaves the row marked
    ``'[no_cleaned_text]'``.  Each row is committed individually.
    """
    db.bind('mysql', **settings.get('DB'))
    db.generate_mapping()
    print('connect to db!')

    with db_session:
        print('select items!')

        estates = EstateQ2Entity.select(lambda e: e.content == '[no_cleaned_text]')
        for estate in estates:

            print('===> process: ', estate.url)

            try:
                summary = Document(estate.html).summary()
            except Exception:
                # Catch Exception rather than using a bare ``except`` so that
                # KeyboardInterrupt / SystemExit can still stop the batch.
                new_content, outcome = '[extract_error]', 'extract_error'
            else:
                if summary:
                    new_content, outcome = summary, 'done'
                else:
                    new_content, outcome = '[no_cleaned_text]', 'no_cleaned_text'

            # Single write/commit/report path shared by all three outcomes.
            estate.content = new_content
            commit()
            print(outcome)
def mark_topic():
    """Compute and store the ``topic`` bitmask of every ``EstateQ2Entity`` that has none.

    Rows whose ``topic`` is NULL are visited in id order.  The row's
    ``seg_freq`` JSON is decoded and passed to ``check_topic1`` ..
    ``check_topic11``; for every checker that returns a truthy value the
    matching ``TOPIC1`` .. ``TOPIC11`` flag is OR-ed into the result, which
    is then saved and committed per row.
    """
    db.bind('mysql', **settings.get('DB'))
    db.generate_mapping()
    print('connect to db!')

    # Each checker is paired with the flag it contributes when it matches.
    topic_rules = (
        (check_topic1, TOPIC1),
        (check_topic2, TOPIC2),
        (check_topic3, TOPIC3),
        (check_topic4, TOPIC4),
        (check_topic5, TOPIC5),
        (check_topic6, TOPIC6),
        (check_topic7, TOPIC7),
        (check_topic8, TOPIC8),
        (check_topic9, TOPIC9),
        (check_topic10, TOPIC10),
        (check_topic11, TOPIC11),
    )

    with db_session:
        print('select items!')

        unmarked = EstateQ2Entity.select(lambda e: e.topic == None).order_by(EstateQ2Entity.id)
        for estate in unmarked:
            seg_freq = json.loads(estate.seg_freq)

            # Start from "no topic" and accumulate one flag per matching rule.
            topic = 0
            for matches, flag in topic_rules:
                if matches(seg_freq):
                    topic = topic | flag

            estate.topic = topic
            commit()

            if topic != 0:
                print('!!! topic: ', topic)

            print('id: ', estate.id)
def filter_seg_freq():
    """Walk every ``EstateEntity`` in id order and decode its ``seg_freq`` JSON.

    The filtering step itself is not implemented yet (see the to-do in the
    loop); at present the decoded value is not used.
    """
    db.bind('mysql', **settings.get('DB'))
    db.generate_mapping()
    print('connect to db!')

    with db_session:
        print('select items!')
        for estate in EstateEntity.select().order_by(EstateEntity.id):
            # Decoded per row; nothing consumes it until the to-do is done.
            seg_freq = json.loads(estate.seg_freq)

            # @todo:

    # NOTE(review): total_seg_freq is never assigned inside this function;
    # unless it is defined at module level this line raises NameError — confirm.
    print('total_seg_freq: ', total_seg_freq)
    print('done')
def export_all_content():
    """Dump url, website, published_at and content of every ``EstateEntity`` to CSV.

    The file at ``csvfile_path`` is overwritten.  A header row is written
    first, followed by one row per entity in id order; ``content`` is
    UTF-8 encoded before being written.
    """
    db.bind('mysql', **settings.get('DB'))
    db.generate_mapping()
    print('connect to db!')

    # NOTE(review): binary mode with csv.writer is the Python 2 convention;
    # on Python 3 csv.writer needs a text-mode file — confirm the target
    # interpreter before changing this.
    with open(csvfile_path, 'wb') as csvfile:
        print('open csvfile!')
        writer = csv.writer(csvfile)

        print('write table head')
        writer.writerow(['url', 'website', 'published_at', 'content'])

        with db_session:
            print('select items!')
            ordered = EstateEntity.select().order_by(EstateEntity.id)
            for estate in ordered:
                writer.writerow([
                    estate.url,
                    estate.website,
                    estate.published_at,
                    estate.content.encode('utf-8'),
                ])

                print('id: ', estate.id)
def export_csv():
    """Dump the metadata columns of every ``EstateQ2Entity`` to CSV.

    The file at ``csvfile_path`` is overwritten.  A header row is written
    first, followed by one row per entity in id order.
    """
    db.bind('mysql', **settings.get('DB'))
    db.generate_mapping()
    print('connect to db!')

    # NOTE(review): binary mode with csv.writer is the Python 2 convention;
    # on Python 3 csv.writer needs a text-mode file — confirm the target
    # interpreter before changing this.
    with open(csvfile_path, 'wb') as csvfile:
        print('open csvfile!')
        writer = csv.writer(csvfile)

        print('write table head')

        # Exported columns, in order; ``content`` is not part of this export.
        writer.writerow(['url', 'website', 'location', 'published_at', 'seg_freq', 'topic'])

        with db_session:
            print('select items!')
            ordered = EstateQ2Entity.select().order_by(EstateQ2Entity.id)
            for estate in ordered:
                writer.writerow([
                    estate.url,
                    estate.website,
                    estate.location,
                    estate.published_at,
                    estate.seg_freq,
                    estate.topic,
                ])
                print('id: ', estate.id)
def items_segment():
    """Run ``segment`` over the content of every ``EstateQ2Entity`` and store the result.

    Rows with non-NULL ``content`` are visited in id order; rows whose
    content is empty are skipped.  The value returned by
    ``segment(estate.content)`` is saved to ``seg_freq``.  If ``segment``
    raises, the row's ``status`` is set to 2 instead.  Each row is
    committed individually.
    """
    db.bind("mysql", **settings.get("DB"))
    db.generate_mapping()
    print("connect to db!")

    with db_session:
        print("select items!")

        estates = EstateQ2Entity.select(lambda e: e.content != None).order_by(EstateQ2Entity.id)
        for estate in estates:

            # The query only excludes NULL; empty strings are skipped here.
            if not estate.content:
                continue

            try:
                seg_freq_res = segment(estate.content)
            except Exception:
                # Catch Exception rather than using a bare ``except`` so that
                # KeyboardInterrupt / SystemExit can still stop the batch.
                estate.status = 2
                commit()
                continue

            estate.seg_freq = seg_freq_res
            commit()
            print("==> id: ", estate.id)
Exemple #8
0
 def __init__(self):
     """Bind ``db`` to MySQL using the ``DB`` settings and generate its mapping."""
     db_options = settings.get('DB')
     db.bind('mysql', **db_options)
     db.generate_mapping()
def extract_content():
    """Fill in ``content`` for ``EstateQ2Entity`` rows that have none, using Goose.

    Command line: when at least two arguments are given, ``sys.argv[1]`` and
    ``sys.argv[2]`` are read as integers ``start_id`` and ``end_id``.  If
    ``start_id`` is non-zero, only rows with ``start_id <= id < end_id`` are
    processed; otherwise (no arguments, or ``start_id == 0``) every row with
    NULL ``content`` is processed.

    For each row, ``goose.extract(raw_html=estate.html)`` is called and its
    ``cleaned_text`` stored.  A failing extraction stores
    ``"[extract_error]"``; an empty ``cleaned_text`` stores
    ``"[no_cleaned_text]"``.  Each row is committed individually.

    Raises:
        ValueError: if the id arguments are not valid integers.
    """
    if len(sys.argv) >= 3:
        start_id = int(sys.argv[1])
        end_id = int(sys.argv[2])
        print("start_id: ", start_id, "  end_id: ", end_id)
    else:
        start_id = 0
        end_id = 0

    db.bind("mysql", **settings.get("DB"))
    db.generate_mapping()
    print("connect to db!")

    goose = Goose({"stopwords_class": StopWordsChinese})

    with db_session:
        print("select items!")

        # Only the query differs between the ranged and the unranged run;
        # start_id == 0 keeps its existing meaning of "no range given".
        if start_id != 0:
            estates = EstateQ2Entity.select(
                lambda e: e.content is None and e.id >= start_id and e.id < end_id
            )
        else:
            estates = EstateQ2Entity.select(lambda e: e.content is None)

        for estate in estates:

            print("===> process: ", estate.url)

            try:
                article = goose.extract(raw_html=estate.html)
            except Exception:
                # Catch Exception rather than using a bare ``except`` so that
                # KeyboardInterrupt / SystemExit can still stop the batch.
                estate.content = "[extract_error]"
                commit()
                print("extract_error")
                continue

            if article.cleaned_text:
                estate.content = article.cleaned_text
                commit()
                print("done")
            else:
                estate.content = "[no_cleaned_text]"
                commit()
                print("no_cleaned_text")