def get_all_liepin_info():
    """
    Crawl every Liepin job link and persist each posting to MySQL,
    skipping links that are already stored.
    """
    list_job_link = get_liepin_job_link()

    my_engine = create_mysql_engine("web_crawler")
    init_liepin_table(my_engine)

    jobs = get_liepin_table_script(my_engine)
    # Fetch each job posting's details one at a time
    for current_job_link in list_job_link:
        nb_row = len(
            jobs.select(jobs.c.link == current_job_link).execute().fetchall())
        if nb_row > 0:
            print("---> 已经爬取了%s的内容,跳过此岗位" % current_job_link)
            continue
        print("---> 正在爬取%s" % current_job_link)
        current_dataset = get_liepin_job_info(current_job_link)
        current_dataset.to_sql(name="jobs",
                               con=my_engine,
                               index=False,
                               if_exists="append")
        sleep(15)
    my_engine.dispose()
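# Hedged sketch: create_mysql_engine (from python_scraping.Hao_Test.tools.sql)
# is used throughout these snippets but never shown. It plausibly wraps
# SQLAlchemy's create_engine as below; the driver, host, and credentials are
# assumptions, not the project's actual values.
from sqlalchemy import create_engine

def create_mysql_engine(database):
    # pymysql is one common MySQL driver choice; charset=utf8 matches the
    # schemas created elsewhere in this project.
    url = ("mysql+pymysql://user:password@localhost:3306/%s?charset=utf8"
           % database)
    return create_engine(url)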
def __init__(self,
             name,
             apply_start,
             apply_end,
             competition_start,
             competition_end,
             is_applied=False,
             distance=100,
             is_finished=False):
    self.mysql_engine = create_mysql_engine("hao_data_base_structure")
    self.meta_data = MetaData(self.mysql_engine)
    self.table_script = init_marathon_table(self.meta_data)
    self.name = name
    self.apply_start = apply_start
    self.apply_end = apply_end
    self.is_applied = is_applied
    self.distance = distance
    self.competition_start = competition_start
    self.competition_end = competition_end
    self.is_finished = is_finished
    self.insert_into_database()
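# Hypothetical usage sketch: the enclosing class name "Marathon" is an
# assumption inferred from init_marathon_table. Constructing an instance
# immediately persists the race via insert_into_database().
race = Marathon(name="Hangzhou Marathon",
                apply_start="2019-08-01",
                apply_end="2019-09-01",
                competition_start="2019-11-03",
                competition_end="2019-11-03",
                distance=42)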
def calculate_matrix_distance(matrix_tf_idf):
    """
    Compute the pairwise Euclidean distance between documents from a
    tf-idf matrix and store the result in MySQL.

    :param matrix_tf_idf: tf-idf matrix, one row per document
    :return: DataFrame with columns source, target and distance
    """
    nb_row, nb_column = np.shape(matrix_tf_idf)

    list_object_first = list([])
    list_object_second = list([])
    list_distance = list([])

    for first_object in range(nb_row):
        for second_object in range(first_object + 1, nb_row, 1):
            distance = 0
            for current_column in range(nb_column):
                distance = distance + pow(
                    matrix_tf_idf[first_object, current_column] -
                    matrix_tf_idf[second_object, current_column], 2)
            distance = np.sqrt(distance)
            list_object_first.append(first_object + 1)
            list_object_second.append(second_object + 1)
            list_distance.append(round(distance, 4))
            print("%d ---> %d ---> %f" %
                  (first_object, second_object, distance))
    result = pd.DataFrame({
        'source': list_object_first,
        'target': list_object_second,
        'distance': list_distance
    })
    my_engine = create_mysql_engine("web_crawler")
    init_similarity_table(my_engine)
    result.to_sql(name='news_similarity',
                  con=my_engine,
                  index=False,
                  if_exists='append')
    my_engine.dispose()
    return result
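# The triple loop above is O(n^2 * m). An equivalent, faster alternative (not
# the author's original approach) computes the same pairwise Euclidean
# distances in one vectorized call:
import numpy as np
from scipy.spatial.distance import pdist

def calculate_matrix_distance_vectorized(matrix_tf_idf):
    # pdist returns the condensed distance vector over the upper triangle,
    # i.e. the same (first_object, second_object) pairs as the loops above.
    return np.round(pdist(np.asarray(matrix_tf_idf), metric="euclidean"), 4)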
# Author: hao.ren3
# Date: 2019/10/8 17:49
# IDE: PyCharm

if __name__ == '__main__':
    from sqlalchemy import Table, MetaData, Integer, Text, Column
    from python_scraping.Hao_Test.tools.sql import create_mysql_engine
    from python_scraping.Hao_Test.word.about_word import jieba_cut
    from gensim.models import Word2Vec

    my_engine = create_mysql_engine("web_crawler")
    meta_data = MetaData(my_engine)
    news_table = Table(
        "news", meta_data,
        Column('id', Integer, primary_key=True, autoincrement=True),
        Column('article', Text))

    data = news_table.select(news_table.c.id == 1).execute().fetchone()
    article = data[1]

    list_words = jieba_cut(article)
    model = Word2Vec(list_words, size=100, window=10, min_count=3)

    my_engine.dispose()
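# Note (an assumption about jieba_cut's return type): gensim's Word2Vec
# expects an iterable of token lists, one list per sentence. If jieba_cut
# returns a flat token list, the call above treats every single word as its
# own sentence; a shape-safe variant would be:
#
#     sentences = [jieba_cut(s) for s in article.split("。") if s]
#     model = Word2Vec(sentences, size=100, window=10, min_count=3)
#
# In gensim >= 4.0 the size= keyword was renamed to vector_size=.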
def create_data_base_engine():
    inside_my_engine = create_mysql_engine("mysql")
    sql = "create database IF NOT EXISTS hao_data_base_structure DEFAULT CHARACTER SET utf8 COLLATE utf8_general_ci"
    inside_my_engine.execute(sql)
    inside_my_engine.dispose()
    return create_mysql_engine("hao_data_base_structure")
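# Engine.execute() as used above was removed in SQLAlchemy 2.0. An equivalent
# sketch for 2.x, where the DDL runs on an explicit connection instead:
from sqlalchemy import text

def create_data_base_engine_v2():
    inside_my_engine = create_mysql_engine("mysql")
    with inside_my_engine.begin() as connection:
        connection.execute(text(
            "create database IF NOT EXISTS hao_data_base_structure "
            "DEFAULT CHARACTER SET utf8 COLLATE utf8_general_ci"))
    inside_my_engine.dispose()
    return create_mysql_engine("hao_data_base_structure")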
# Author: hao.ren3
# Date: 2019/10/16 18:25
# IDE: PyCharm

from python_scraping.Hao_Test.tools.sql import create_mysql_engine
from python_scraping.Hao_Test.data_base_structure.init_table import init_data_column_table, init_data_base_table, init_data_table_table
from sqlalchemy import MetaData
from sqlalchemy.orm.session import sessionmaker

if __name__ == "__main__":
    my_engine = create_mysql_engine("hao_data_base_structure")
    my_meta_data = MetaData(my_engine)
    Session = sessionmaker(bind=my_engine)
    session = Session()

    table_data_base = init_data_base_table(mysql_meta_data=my_meta_data)
    table_data_table = init_data_table_table(mysql_meta_data=my_meta_data)
    table_data_column = init_data_column_table(mysql_meta_data=my_meta_data)

    # Core-style select joining columns -> tables -> databases (renamed from
    # "test" so it is not shadowed by the session query below).
    core_select = (table_data_column.select().join(
        table_data_table,
        table_data_column.c.data_table_id == table_data_table.c.id).join(
            table_data_base,
            table_data_column.c.data_base_id == table_data_base.c.id))

    # ORM-style query over the same joins, projecting the three name columns.
    test = (session.query(
        table_data_base.c.name, table_data_table.c.name,
        table_data_column.c.name).join(
            table_data_table,
            table_data_column.c.data_table_id == table_data_table.c.id).join(
                table_data_base,
                table_data_column.c.data_base_id == table_data_base.c.id))
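    # Hedged execution sketch: materialize the (database, table, column) name
    # triples and release the resources, mirroring the other snippets above.
    for db_name, tbl_name, col_name in test.all():
        print("%s.%s.%s" % (db_name, tbl_name, col_name))

    session.close()
    my_engine.dispose()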