def import_and_clean():
    """Read the four split CSVs, clean them, and add FIPS county codes.

    Chains together several helpers from ``data_clean.py`` (imported as
    ``dc``). The CSVs are expected to have been written beforehand by
    ``data_import.py``, which performs the test-train split.

    Returns:
        A 6-tuple ``(X_train, X_test, y_train, y_test, full_data,
        full_target)``, where the last element of each ``create_fips_df``
        call is kept as the "full" feature / target set used for plots.
    """
    # Load the raw ("dirty") splits; the first CSV column is the index.
    raw_train_features = pd.read_csv('./data/dirty_X_train.csv', index_col=0)
    raw_train_target = pd.read_csv('./data/dirty_y_train.csv', index_col=0)
    raw_test_features = pd.read_csv('./data/dirty_X_test.csv', index_col=0)
    raw_test_target = pd.read_csv('./data/dirty_y_test.csv', index_col=0)

    # Basic cleaning, applied to each features/target pair.
    train_features, train_target = dc.data_clean(
        raw_train_features, raw_train_target)
    test_features, test_target = dc.data_clean(
        raw_test_features, raw_test_target)

    # Build train/test/full sets carrying FIPS county codes (for plots).
    train_features, test_features, full_data = dc.create_fips_df(
        train_features, test_features)
    train_target, test_target, full_target = dc.create_fips_df(
        train_target, test_target)

    return (train_features, test_features, train_target, test_target,
            full_data, full_target)
Esempio n. 2
0
def get_info(url):
    """Scrape one job-listing page and store the Python developer postings.

    The page at ``url`` is downloaded, decoded as GBK and parsed with
    BeautifulSoup. Every posting whose title contains "Python" or "python"
    and also "开发工程师" has its salary text passed through ``data_clean``
    and is saved as a ``data_table`` row via a session obtained from
    ``creat_session``. When the posting's location contains "北京", the
    module-level ``avg_salary`` is replaced by ``calculate_salary`` of the
    cleaned salary.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        None. Results are written to the database and to the global
        ``avg_salary``.
    """
    global avg_salary

    html = urlopen(url).read().decode('GBK')
    soup = BeautifulSoup(html, "html.parser")

    # Job titles (anchor tags; the text is read from their ``title`` attr).
    titles = soup.select("p[class='t1'] a")
    # Work locations.
    di = soup.select("span[class='t3']")
    # Company names.
    company = soup.select("span[class='t2']")
    # Salary text.
    salaries = soup.select("span[class='t4']")
    # Publication dates. Not named ``time`` so the stdlib module name is
    # not shadowed inside this function.
    dates = soup.select("span[class='t5']")

    for i, link in enumerate(titles):
        title = link.get('title')
        if not title:
            # Anchor without a title attribute: nothing to match against.
            continue
        # The original test `("Python" or "python") in title` only ever
        # checked "Python", because `or` returns its first truthy operand.
        if not ("Python" in title or "python" in title):
            continue
        if "开发工程师" not in title:
            continue

        # The span lists are indexed at i + 1: they appear to hold one extra
        # leading entry (presumably the table header row) - TODO confirm.
        # Clean the salary text and normalise its unit.
        m = data_clean(salaries[i + 1].get_text())
        address = di[i + 1].get_text()

        # Only Beijing postings feed the salary statistic.
        if "北京" in address:
            avg_salary = calculate_salary(m)

        # NOTE(review): a new session is created per posting and never
        # explicitly closed here - confirm creat_session() handles cleanup.
        session = creat_session()
        try:
            obj = data_table(POSITION=title,
                             COMPANY=company[i + 1].get_text(),
                             ADDRESS=address,
                             SALARY=m,
                             DATE=dates[i + 1].get_text())  # build the row object
            session.add(obj)  # stage the new row in the session
            session.commit()
        except Exception:
            # Best-effort insert: skip postings that fail to save, but let
            # KeyboardInterrupt / SystemExit propagate (a bare ``except:``
            # would have swallowed those too).
            continue
Esempio n. 3
0
import os
import numpy as np
import pandas as pd
import time
import threading
import traceback
import datetime
import queue
import codecs


# Each block below imports a project module and then immediately rebinds the
# module's name to the result of calling the same-named attribute inside it.
# After these lines `get_data`, `data_clean` and `data_output` refer to those
# returned objects, and the modules themselves are no longer reachable by name.
import get_data
get_data = get_data.get_data()

import data_clean
data_clean = data_clean.data_clean()

import data_output
data_output = data_output.data_output()

#import set_log  

#log_obj = set_log.Logger('main_proess.log', set_log.logging.WARNING,
#                         set_log.logging.DEBUG)
#log_obj.cleanup('main_proess.log', if_cleanup = True)  # whether to clear the log file before each run


class bg_data_main_proess(threading.Thread):
    """Thread subclass constructed with ``args_queue`` and ``lock``."""

    def __init__(self, args_queue, lock):
        """Run the base ``Thread`` initialiser.

        NOTE(review): ``args_queue`` and ``lock`` are accepted but not used
        by the code visible here - the definition may be truncated.
        """
        super().__init__()
from data_get import data_get
from data_clean import data_clean
from data_preprocessing import data_preprocessing

# Read stocks_list.txt and keep one entry per line with the newline
# characters removed.
with open("stocks_list.txt", "r") as f:
    data = [line.replace("\n", "") for line in f.readlines()]
stocks_list = list(data)

# The download step is currently disabled; only cleaning and preprocessing
# are run.
# data_get(stocks_list)
data_clean()
data_preprocessing()
Esempio n. 5
0
# Change the phrases in the file to get the prediction.
# Multinomial NB classifier
import random_predict as rp
import data_clean as dc

# Ask the user for a question on stdin.
phrase = input("Enter the your question.. : ")

# Clean the raw text, then take the cleaner's string form.
cleaner = dc.data_clean(phrase)
result_str = cleaner.return_str()

# Feed the cleaned string to the predictor.
result = rp.random_predict(result_str)




Esempio n. 6
0
import data_clean as dc

# Fixed sample sentence used to exercise the cleaner.
str_data = "Send my car Insurance Card"

# Clean the sentence and print the cleaner's string form.
cleaner = dc.data_clean(str_data)
result = cleaner.return_str()
print(result)
Esempio n. 7
0
	def __init__(self):
		"""Initialise the parent class and store a ``dc.data_clean()`` result as ``self.dc``."""
		# Explicit two-argument super(); the enclosing class is expected to be model_1.
		super(model_1, self).__init__()
		# Object returned by the project's data_clean callable, kept for later use.
		self.dc = dc.data_clean()