def import_and_clean():
    """Load the pre-split CSVs, clean them, and attach FIPS county codes.

    data_import.py performs the train/test split and writes four "dirty"
    CSVs; this function reads them back, applies dc.data_clean to each
    split, then builds the FIPS-augmented full frames used for plotting.

    Returns:
        (X_train, X_test, y_train, y_test, full_data, full_target)
    """
    # Read the four split files written by data_import.py (same order as before).
    raw = {
        name: pd.read_csv(f'./data/dirty_{name}.csv', index_col=0)
        for name in ('X_train', 'y_train', 'X_test', 'y_test')
    }

    # Basic cleaning of each split.
    X_train, y_train = dc.data_clean(raw['X_train'], raw['y_train'])
    X_test, y_test = dc.data_clean(raw['X_test'], raw['y_test'])

    # Full train+test frames with FIPS county codes for plots.
    X_train, X_test, full_data = dc.create_fips_df(X_train, X_test)
    y_train, y_test, full_target = dc.create_fips_df(y_train, y_test)

    return X_train, X_test, y_train, y_test, full_data, full_target
def get_info(url):
    """Scrape one 51job result page and persist Python-developer postings.

    Fetches *url* (GBK-encoded page), extracts title / location / company /
    salary / posting date, and stores each matching "Python 开发工程师"
    posting through the ORM session. For postings located in Beijing it also
    updates the module-level ``avg_salary``.

    NOTE(review): the detail columns are indexed with ``i + 1`` while titles
    use ``i`` — presumably because the first span in each column is a table
    header row; confirm against the page markup.

    Args:
        url: URL of a 51job search-result page.
    """
    global avg_salary
    html = urlopen(url).read().decode('GBK')
    soup = BeautifulSoup(html, "html.parser")
    # Job title links
    titles = soup.select("p[class='t1'] a")
    # Work locations
    di = soup.select("span[class='t3']")
    # Company names
    company = soup.select("span[class='t2']")
    # Salary strings (CSS selectors)
    salaries = soup.select("span[class='t4']")
    # Posting dates
    time = soup.select("span[class='t5']")
    for i in range(len(titles)):
        title = titles[i].get('title')
        # BUG FIX: the original condition used ("Python" or "python"),
        # which evaluates to just "Python" and silently skipped lowercase
        # matches. Match the keyword case-insensitively instead.
        if "python" in title.lower() and "开发工程师" in title:
            # Normalize the salary string to a common unit.
            m = data_clean(salaries[i + 1].get_text())
            # Update the running average only for Beijing postings.
            if "北京" in di[i + 1].get_text():
                avg_salary = calculate_salary(m)
            session = creat_session()
            try:
                obj = data_table(POSITION=title,
                                 COMPANY=company[i + 1].get_text(),
                                 ADDRESS=di[i + 1].get_text(),
                                 SALARY=m,
                                 DATE=time[i + 1].get_text())
                session.add(obj)
                session.commit()
            except Exception:
                # Best-effort insert: skip rows that fail to persist
                # (e.g. duplicates) without swallowing KeyboardInterrupt
                # the way the original bare ``except:`` did.
                continue
import os
import numpy as np
import pandas as pd
import time
import threading
import traceback
import datetime
import queue
import codecs

# NOTE: each of the three project modules below is imported and then its
# name is immediately rebound to an instance of the class of the same name,
# so the module objects themselves are no longer reachable afterwards.
import get_data
get_data = get_data.get_data()
import data_clean
data_clean = data_clean.data_clean()
import data_output
data_output = data_output.data_output()

# import set_log
# log_obj = set_log.Logger('main_proess.log', set_log.logging.WARNING,
#                          set_log.logging.DEBUG)
# log_obj.cleanup('main_proess.log', if_cleanup = True)  # whether to clear the log file before each run


class bg_data_main_proess(threading.Thread):
    # Background worker thread for the main data process.
    # NOTE(review): the visible body only initializes the Thread base class;
    # ``args_queue`` and ``lock`` are not stored here — presumably the rest
    # of ``__init__`` (and ``run``) continues beyond this chunk; confirm.
    def __init__(self, args_queue, lock):
        threading.Thread.__init__(self)
from data_get import data_get
from data_clean import data_clean
from data_preprocessing import data_preprocessing

# Pipeline driver: read the ticker list, then run cleaning and preprocessing.

stocks_list = []
with open("stocks_list.txt", "r") as f:
    # One stock symbol per line; drop the newline from each entry.
    # (Replaces the original index-based loop with direct file iteration.)
    stocks_list.extend(line.replace("\n", "") for line in f)

# data_get(stocks_list)  # download step, currently disabled
data_clean()
data_preprocessing()
# Change the phrase entered below to get a different prediction.
# Multinomial NB classifier.
import random_predict as rp
import data_clean as dc

# Prompt fixed: original read "Enter the your question.. : " (typo/grammar).
phrase = input("Enter your question: ")
# Clean the raw phrase, then classify the cleaned string.
result_str = dc.data_clean(phrase).return_str()
result = rp.random_predict(result_str)
import data_clean as dc

# Demo: run the cleaner over one sample sentence and print the result.
sample_text = "Send my car Insurance Card"
cleaned = dc.data_clean(sample_text).return_str()
print(cleaned)
def __init__(self):
    """Initialize the parent module state and attach a data_clean helper."""
    super(model_1, self).__init__()
    # Cleaner instance kept on the model for input preprocessing.
    cleaner = dc.data_clean()
    self.dc = cleaner