def get_all_liepin_info():
    """Crawl all Liepin job postings that are not yet stored.

    Fetches the list of job links, skips any link already present in the
    ``jobs`` table, scrapes each remaining posting, appends it to the
    database, and throttles between requests.

    :return: None
    """
    list_job_link = get_liepin_job_link()
    my_engine = create_mysql_engine("web_crawler")
    init_liepin_table(my_engine)
    jobs = get_liepin_table_script(my_engine)

    # Read each job posting one by one.
    for current_job_link in list_job_link:
        # Only an existence check is needed: fetch a single row instead of
        # materializing and counting every matching row.
        existing_row = jobs.select(
            jobs.c.link == current_job_link).execute().fetchone()
        if existing_row is not None:
            print("---> 已经爬取了%s的内容,跳过此岗位" % current_job_link)
            continue
        print("---> 正在爬取%s" % current_job_link)
        current_dataset = get_liepin_job_info(current_job_link)
        current_dataset.to_sql(name="jobs", con=my_engine, index=False,
                               if_exists="append")
        # Be polite to the target site: pause between requests.
        sleep(15)
    my_engine.dispose()
def __init__(self, name, apply_start, apply_end, competition_start,
             competition_end, is_applied=False, distance=100,
             is_finished=False):
    """Build a marathon record and write it to the database immediately.

    :param name: name of the marathon event.
    :param apply_start: start of the application window.
    :param apply_end: end of the application window.
    :param competition_start: start of the race itself.
    :param competition_end: end of the race itself.
    :param is_applied: whether an application has been submitted.
    :param distance: race distance (defaults to 100).
    :param is_finished: whether the race has been completed.
    """
    # Database plumbing: engine, metadata, and the marathon table script.
    self.mysql_engine = create_mysql_engine("hao_data_base_structure")
    self.meta_data = MetaData(self.mysql_engine)
    self.table_script = init_marathon_table(self.meta_data)

    # Event attributes, grouped by topic.
    self.name = name
    self.distance = distance
    self.apply_start = apply_start
    self.apply_end = apply_end
    self.is_applied = is_applied
    self.competition_start = competition_start
    self.competition_end = competition_end
    self.is_finished = is_finished

    # Persist the freshly constructed record right away.
    self.insert_into_database()
def calculate_matrix_distance(matrix_tf_idf):
    """Compute pairwise Euclidean distances between document rows of a
    tf-idf matrix and persist them.

    Each unordered pair of documents (1-based ids in the output) gets one
    row with its distance rounded to 4 decimals. The result is appended to
    the ``news_similarity`` table in the ``web_crawler`` database.

    :param matrix_tf_idf: 2-D tf-idf matrix, one row per document.
    :return: DataFrame with columns ``source``, ``target``, ``distance``.
    """
    nb_row, nb_column = np.shape(matrix_tf_idf)
    # Normalize to ndarray once so row slicing/vector math works uniformly.
    matrix = np.asarray(matrix_tf_idf)
    list_object_first = list([])
    list_object_second = list([])
    list_distance = list([])
    for first_object in range(nb_row):
        for second_object in range(first_object + 1, nb_row, 1):
            # Vectorized Euclidean distance replaces the per-column
            # Python loop (same value, computed in one NumPy pass).
            difference = matrix[first_object] - matrix[second_object]
            distance = float(np.sqrt(np.sum(np.square(difference))))
            # Stored ids are 1-based; the progress print keeps the
            # original 0-based indices.
            list_object_first.append(first_object + 1)
            list_object_second.append(second_object + 1)
            list_distance.append(round(distance, 4))
            print("%d ---> %d ---> %f" % (first_object, second_object,
                                          distance))
    result = pd.DataFrame({
        'source': list_object_first,
        'target': list_object_second,
        'distance': list_distance
    })
    my_engine = create_mysql_engine("web_crawler")
    init_similarity_table(my_engine)
    result.to_sql(name='news_similarity', con=my_engine, index=False,
                  if_exists='append')
    my_engine.dispose()
    return result
# 作者:hao.ren3 # 时间:2019/10/8 17:49 # IDE:PyCharm if __name__ == '__main__': from sqlalchemy import Table, MetaData, Integer, Text, Column from python_scraping.Hao_Test.tools.sql import create_mysql_engine from python_scraping.Hao_Test.word.about_word import jieba_cut from gensim.models import Word2Vec my_engine = create_mysql_engine("web_crawler") meta_data = MetaData(my_engine) news_table = Table( "news", meta_data, Column('id', Integer, primary_key=True, autoincrement=True), Column('article', Text)) data = news_table.select(news_table.c.id == 1).execute().fetchone() article = data[1] list_words = jieba_cut(article) model = Word2Vec(list_words, size=100, window=10, min_count=3) my_engine.dispose()
def create_data_base_engine():
    """Ensure the ``hao_data_base_structure`` database exists and return an
    engine bound to it.

    Connects to the server-level ``mysql`` database to issue the
    ``CREATE DATABASE IF NOT EXISTS`` statement, then hands back a fresh
    engine pointed at the (possibly newly created) database.

    :return: SQLAlchemy engine for ``hao_data_base_structure``.
    """
    inside_my_engine = create_mysql_engine("mysql")
    sql = ("create database IF NOT EXISTS hao_data_base_structure "
           "DEFAULT CHARACTER SET utf8 COLLATE utf8_general_ci")
    try:
        inside_my_engine.execute(sql)
    finally:
        # Dispose even if the DDL fails, so the bootstrap connection pool
        # is never leaked.
        inside_my_engine.dispose()
    return create_mysql_engine("hao_data_base_structure")
# 作者:hao.ren3 # 时间:2019/10/16 18:25 # IDE:PyCharm from python_scraping.Hao_Test.tools.sql import create_mysql_engine from python_scraping.Hao_Test.data_base_structure.init_table import init_data_column_table, init_data_base_table, init_data_table_table from sqlalchemy import MetaData from sqlalchemy.orm.session import sessionmaker if __name__ == "__main__": my_engine = create_mysql_engine("hao_data_base_structure") my_meta_data = MetaData(my_engine) Session = sessionmaker(bind=my_engine) session = Session() table_data_base = init_data_base_table(mysql_meta_data=my_meta_data) table_data_table = init_data_table_table(mysql_meta_data=my_meta_data) table_data_column = init_data_column_table(mysql_meta_data=my_meta_data) test = (table_data_column.select().join( table_data_table, table_data_column.c.data_table_id == table_data_table.c.id).join( table_data_base, table_data_column.c.data_base_id == table_data_base.c.id)) test = (session.query( table_data_base.c.name, table_data_table.c.name, table_data_column.c.name).join( table_data_table, table_data_column.c.data_table_id == table_data_table.c.id).join( table_data_base, table_data_column.c.data_base_id ==