def main():
    trail_df, weather_df = dg.get_data()
    merged = dg.merge_dataframes(trail_df, weather_df)
    # utils.print_heads(merged=merged)
    pseudo_clock(merged)
    pseudo_clock_NN(merged)
def get_answer(topic_id, topic_name):
    # Ignoring changes in the remote data: when the step is smaller than the
    # limit, sampled ranges are guaranteed to overlap.
    start = time.perf_counter()  # start timing
    zhi = ZhiHu()
    func = zhi.questions.answers
    question_list = load_question(topic_name)
    total_num = len(question_list)
    print("[Fetch answers] ===== Preparing requests for {}, {} questions in total =====".format(topic_name, total_num))
    answer_all = []
    fetch_body = []
    for question in question_list:
        question_id = question[0]
        question_ansnum = int(question[1])
        fetch_body.append({"identifier": question_id,
                           "query_args": ["content"],
                           "range": [0, question_ansnum]})
    print("[Fetch answers] ===== Requesting data for {}, {} questions in total =====".format(topic_name, total_num))
    res = get_data(fetch_body, func,
                   process_num=PROCESS_NUM,
                   max_process_num=MAX_PROCESS_NUM,
                   flood_discharge_ratio=FLOOD_DISCHARGE_RATIO,
                   floodplain_ratio=FLOODPLAIN_RATIO,
                   headers_pool=HEADERS_POOL)
    print("[Fetch answers] ===== Processing data for {}, {} questions in total =====".format(topic_name, total_num))
    i = 0
    for question_id, question_result in res.items():
        i += 1
        answer_list = question_result["data"]
        if i % 1000 == 0:
            print("[Processing question {} / {}]".format(i, total_num), question_id)
        for item in answer_list:
            answer_id = item["id"]
            raw_ans = item["content"]
            question_content = item["question"]["title"]
            answer_content = remove_tags(raw_ans)  # strip HTML tags from the raw answer
            answer_all.append((question_id, answer_id, question_content, answer_content))
    print("[Fetch answers] ===== Saving data for {}, {} questions in total =====".format(topic_name, total_num))
    file_name = str(topic_name) + "_answers.csv"
    file_path = os.path.join("./data", file_name)
    with open(file_path, "a", encoding="utf-8-sig", newline='') as file:
        writer = csv.writer(file)
        writer.writerows(answer_all)
def get_topic(topic_id, topic_name, topic_ansnum):
    # Ignoring changes in the remote data: when the step is smaller than the
    # limit, sampled ranges are guaranteed to overlap.
    start = time.perf_counter()  # start timing
    zhi = ZhiHu()
    func = zhi.topic.timeline_question
    print("[Fetch questions] ===== Fetching data for {}, {} answers in total =====".format(topic_name, topic_ansnum))
    fetch_dict = [{"identifier": topic_id,
                   "query_args": ["answer_count"],
                   "range": [0, topic_ansnum]}]
    res = get_data(fetch_dict, func,
                   process_num=PROCESS_NUM,
                   max_process_num=MAX_PROCESS_NUM,
                   flood_discharge_ratio=FLOOD_DISCHARGE_RATIO,
                   floodplain_ratio=FLOODPLAIN_RATIO,
                   headers_pool=HEADERS_POOL)
    print("[Fetch questions] ===== Processing answers for {}, {} answers in total =====".format(topic_name, topic_ansnum))
    data = res[topic_id]["data"]
    question_set = set()
    for item in data:
        question_id = item["target"]["id"]
        question_title = item["target"]["title"]
        ans_count = item["target"]["answer_count"]
        if ans_count == 0:  # skip questions with no answers
            continue
        question_set.add((question_id, ans_count, question_title))
    print("[Fetch questions] ===== Saving questions for {}, {} answers in total =====".format(topic_name, topic_ansnum))
    file_name = str(topic_name) + "_topic.csv"
    file_path = os.path.join("./data", file_name)
    with open(file_path, "a", encoding="utf-8-sig", newline='') as file:
        writer = csv.writer(file)
        writer.writerows(question_set)
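# Usage sketch (assumption): a hypothetical driver chaining the two fetchers.
# The id, name and answer count below are placeholders, and load_question()
# is assumed to read the "<topic>_topic.csv" that get_topic() writes, so
# get_topic() has to run first.
if __name__ == "__main__":
    TOPIC_ID = "12345678"   # placeholder topic id
    TOPIC_NAME = "example"  # placeholder topic name
    get_topic(TOPIC_ID, TOPIC_NAME, topic_ansnum=1000)
    get_answer(TOPIC_ID, TOPIC_NAME)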
# installed libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.optimize import curve_fit
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, RobustScaler
# local files
import data_getter
import processing

plt.style.use('ggplot')

x_data, _, data = data_getter.get_data('as7263 mango verbose')
print(data.columns)
# optional filters by integration time / sensor position:
# x_data = x_data[data['integration time'] == 200]
# x_data = x_data[data['position'] == 'pos 2']
# data = data[data['integration time'] == 200]
# data = data[data['position'] == 'pos 2']
# chloro_data = data.groupby('Leaf number', as_index=True).mean()

# average the repeated scans for each leaf
data = data.groupby('Leaf number', as_index=True).mean()

# keep only the spectral channels (columns named by wavelength, e.g. '610 nm')
data_columns = []
for column in data.columns:
    if 'nm' in column:
        data_columns.append(column)
x_data = data[data_columns]
print(data)
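# Sketch (assumption): curve_fit is imported above but the fit itself is not
# shown in this excerpt; the model and the 'Total Chlorophyll (ug/ml)' target
# below are illustrative placeholders, not this script's actual fit.
def exp_saturation(x, a, b, c):
    # simple saturating response of a spectral band to chlorophyll content
    return a * np.exp(-b * x) + c

params, _ = curve_fit(exp_saturation,
                      x_data.iloc[:, 0].to_numpy(),
                      data['Total Chlorophyll (ug/ml)'].to_numpy(),
                      p0=(1.0, 0.001, 0.0), maxfev=10000)
print(params)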
# installed libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import GroupShuffleSplit
from sklearn import linear_model
# local files
import data_getter

plt.style.use('seaborn')

x_data, _, full_data = data_getter.get_data('as7262 betal')
# alternative data sources kept for reference:
# data = pd.read_csv('mango_chloro_refl3.csv')
# full_data = pd.read_csv('mango_flouro_rows.csv')
# full_data = pd.read_csv('as7262_mango.csv')
# full_data = full_data.groupby('Leaf number', as_index=True).mean()
# data = data.loc[(data['integration time'] == 250)]
print(full_data.columns)

LEDs = ['White LED', 'IR LED', 'UV (405 nm) LED', '390 nm LED', '395 nm LED',
        '400 nm LED', '405 nm LED', '410 nm LED', '425 nm LED', '455 nm LED',
        '465 nm LED', '470 nm LED', '475 nm LED', '480 nm LED', '505 nm LED',
        '525 nm LED', '630 nm LED', '890 nm LED', '940 nm LED']
LED = LEDs[0]
data_columns = []
import pandas as pd
from sklearn.cross_decomposition import PLSRegression
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.datasets import make_moons, make_blobs
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
# local files
import data_getter
import processing

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

x_data, y, data = data_getter.get_data('as7262 betal')
# x_data = processing.snv(x_data)
y = y['Total Chlorophyll (ug/ml)']
print(data.index)
results = pd.DataFrame([], index=data.index)
print(results)

# four outlier detectors, all configured to flag the same fraction of samples
outliers_fraction = 0.1
algorithms = [
    ("Elliptic Envelope",
     EllipticEnvelope(contamination=outliers_fraction)),
    ("One-Class SVM",
     svm.OneClassSVM(nu=outliers_fraction, kernel="rbf", gamma=0.1)),
    ("Isolation Forest",
     IsolationForest(contamination=outliers_fraction, random_state=42)),
    ("Local Outlier Factor",
     LocalOutlierFactor(n_neighbors=35,
                        contamination=outliers_fraction))]  # assumed closing args; source truncates here
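# Sketch (assumption): how the detectors above are presumably applied --
# label every sample as inlier (+1) or outlier (-1) and collect the labels
# in the `results` frame; fit_predict is available on all four estimators.
for name, algorithm in algorithms:
    results[name] = algorithm.fit_predict(x_data)
print(results[(results == -1).any(axis=1)])  # rows flagged by at least one detector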
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import QuantileTransformer, PowerTransformer
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.svm import LinearSVR, NuSVR, SVR
# local files
import data_getter
import processing

plt.style.use('seaborn')

# data = pd.read_csv('as7262_mango.csv')
# data = data.groupby('Leaf number', as_index=True).mean()
x_data, data = data_getter.get_data("as7262 mango", remove_outlier=True,
                                    only_pos2=False)

# keep only the spectral channels (columns named by wavelength)
data_columns = []
for column in data.columns:
    if 'nm' in column:
        data_columns.append(column)
print(data_columns)
# x_data = data[data_columns]

y_columns = ['Total Chlorophyll (ug/ml)',
             'Chlorophyll a (ug/ml)',
             'Chlorophyll b (ug/ml)']
invert_y = False
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import StandardScaler
# local files
import data_getter

plt.style.use('ggplot')

# Prepare data
spectrum_data, chloro_data, full_data = data_getter.get_data("as7262 mango")
print(spectrum_data.columns)
print(chloro_data.columns)
print(full_data.columns)


class ModelFit(object):
    """Accumulate train/test scores and their standard deviations across runs."""

    def __init__(self):
        self.test_score = []
        self.test_stdev = []
        self.train_score = []
        self.train_stdev = []

    def add_data(self, data):
        self.test_score.append(data)
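# Sketch (assumption): how ModelFit is presumably filled from cross_validate;
# the component count, split sizes, and target column are placeholders.
model = ModelFit()
pls = PLSRegression(n_components=3)
cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=0)
scores = cross_validate(pls, spectrum_data,
                        chloro_data['Total Chlorophyll (ug/ml)'],
                        cv=cv, return_train_score=True)
model.add_data(scores['test_score'].mean())
model.test_stdev.append(scores['test_score'].std())
model.train_score.append(scores['train_score'].mean())
model.train_stdev.append(scores['train_score'].std())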
def main():
    trail_df, weather_df = dg.get_data()
    merged = dg.merge_dataframes(trail_df, weather_df)
    # utils.print_heads(trail_df, weather_df, merged)
    weather_predictor(merged, True)
# installed libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.optimize import curve_fit
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, RobustScaler
# local files
import data_getter
import processing

plt.style.use('ggplot')

x_data, chloro_data, data = data_getter.get_data('as7262 betal')
print(data.columns)
# optional filters by integration time / sensor position:
# print(data['integration time'].unique(), data['position'].unique())
# x_data = x_data[data['integration time'] == 200]
# x_data = x_data[data['position'] == 'pos 2']
# data = data[data['integration time'] == 200]
# data = data[data['position'] == 'pos 2']
# chloro_data = data.groupby('Leaf number', as_index=True).mean()

# keep only the spectral channels (columns named by wavelength)
data_columns = []
for column in data.columns:
    if 'nm' in column:
        data_columns.append(column)
print(data_columns)
print(x_data)
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# local files
import data_getter
import processing

plt.style.use('seaborn')

# data = pd.read_csv("as7262_mango.csv")
# data = data.loc[(data['position'] == 'pos 2')]
# data = data.loc[(data['integration time'] == 3)]
# data = data.groupby('Leaf number', as_index=True).mean()
x_data, _, data = data_getter.get_data('as7263 roseapple')
chloro_data = data.groupby('Leaf number', as_index=True).mean()

# scale total chlorophyll to [0, 1] and use it as the alpha channel,
# so more opaque lines correspond to higher chlorophyll content
accent_column = chloro_data['Total Chlorophyll (ug/ml)'].to_numpy()
accent_column = accent_column / max(accent_column)
print(accent_column)

alphas = np.linspace(0.1, 1, 10)
colors = np.zeros((chloro_data.shape[0], 4))  # one RGBA row per leaf
colors[:, 0] = 0.2  # red
colors[:, 1] = 0.6  # green
colors[:, 2] = 0.2  # blue
colors[:, 3] = accent_column  # alpha
spectrum_data_columns = []
wavelengths = []
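# Sketch (assumption): the excerpt ends before the two lists are filled; the
# sibling scripts select the 'nm' columns, so presumably something like this
# follows, drawing one spectrum per leaf with the RGBA colors built above.
for column in chloro_data.columns:
    if 'nm' in column:
        spectrum_data_columns.append(column)
        wavelengths.append(int(column.split(' nm')[0]))
for i in range(chloro_data.shape[0]):
    plt.plot(wavelengths, chloro_data.iloc[i][spectrum_data_columns],
             color=colors[i])
plt.xlabel('wavelength (nm)')
plt.ylabel('sensor reading')
plt.show()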
from data_getter import get_data
import matplotlib.pyplot as plt
import networkx as nx

LOGIN = '******'
PASSWORD = '******'

graph_data = get_data(LOGIN, PASSWORD)

nodes = []
totals = []
for info in graph_data:
    nodes.append(info['login'])
    totals.append(info['total'])
nodes.append(1)  # hub node that every user connects to

G = nx.Graph()
G.add_nodes_from(nodes)

# connect each user to the hub, weighting the edge by the user's total
ebunches = []
for node, total in zip(nodes, totals):
    ebunches.append((node, 1, {'weight': total}))
G.add_edges_from(ebunches)

nx.draw_shell(G, with_labels=True, font_weight='bold')
plt.show()
import matplotlib.pyplot as plt
from sklearn.linear_model import PassiveAggressiveRegressor, ridge_regression
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import learning_curve, ShuffleSplit, KFold
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import QuantileTransformer, PowerTransformer
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.svm import LinearSVR, NuSVR, SVR
# local files
import data_getter
import processing

plt.style.use('ggplot')

data, chloro_data = data_getter.get_data('as7263 roseapple')
# chloro_data = data.groupby('Leaf number', as_index=True).mean()
print(data)
print(chloro_data)

# keep only the spectral channels (columns named by wavelength)
data_columns = []
print('=====')
print(data.columns)
for column in data.columns:
    if 'nm' in column:
        data_columns.append(column)
print(data_columns)
x_data = data[data_columns]
print(data.columns)

y_columns = ['Total Chlorophyll (ug/ml)',
             'Chlorophyll a (ug/ml)',
             'Chlorophyll b (ug/ml)']  # closing bracket assumed; source truncates mid-list
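# Sketch (assumption): cross-validated scoring of one of the imported
# regressors against each chlorophyll target; the scaler and SVR settings
# are illustrative only, not this script's final configuration.
for y_column in y_columns:
    y = chloro_data[y_column]
    x_scaled = StandardScaler().fit_transform(x_data)
    scores = cross_validate(SVR(kernel='linear'), x_scaled, y,
                            cv=KFold(n_splits=5, shuffle=True, random_state=0),
                            scoring='r2', return_train_score=True)
    print(y_column, scores['test_score'].mean(), scores['test_score'].std())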
        if (is_int(inp)) and int(inp) in range(len(lang_lst)):
            inp = int(inp)
            lang = lang_lst[inp]  # map the menu index back to a language
            projects_lst = get_tag_projects_by_lang(lang, tag_projects, all_projects)
            title = ("Relevant Languages by tag: {tag} and projects regarding "
                     "the Language: {lang}").format(tag=tag, lang=lang)
            make_projects_page(title, "Relevant Languages by Tag", lang_lst,
                               projects_lst, all_projects)
            webbrowser.open('out.htm')
            break
    break


if __name__ == '__main__':
    # main loop
    options = ['Update data', 'Get Popular tags by order',
               'Get Popular Programming languages by order']
    while True:
        for j, option in enumerate(options):
            print(j, option)
        inp = input("\nyour choice:")
        if (is_int(inp)) and int(inp) in range(len(options)):
            inp = int(inp)
            if inp == 0:
                data_getter.get_data()
                break
            elif inp == 1:
                get_data_by_tag()
                break
            elif inp == 2:
                get_data_by_lang()
                break