def read_weather():
    """Read the weather CSVs for Sapporo, Tokyo, Osaka and Fukuoka and combine them."""
    path_1 = "Receipt/気象情報/札幌(天気【2016年4月1日~2017年3月30日】).csv"
    path_3 = "Receipt/気象情報/東京(天気【2016年4月1日~2017年3月30日】).csv"
    path_6 = "Receipt/気象情報/大阪(天気【2016年4月1日~2017年3月30日】).csv"
    path_9 = "Receipt/気象情報/福岡(天気【2016年4月1日~2017年3月30日】).csv"
    tenki_filepath_1 = os.path.join(common.data_dir, path_1)
    tenki_filepath_3 = os.path.join(common.data_dir, path_3)
    tenki_filepath_6 = os.path.join(common.data_dir, path_6)
    tenki_filepath_9 = os.path.join(common.data_dir, path_9)
    tenki_1 = common.read_csv(tenki_filepath_1, parse_dates=["年月日時"], skiprows=3)
    tenki_3 = common.read_csv(tenki_filepath_3, parse_dates=["年月日時"], skiprows=3)
    tenki_6 = common.read_csv(tenki_filepath_6, parse_dates=["年月日時"], skiprows=3)
    tenki_9 = common.read_csv(tenki_filepath_9, parse_dates=["年月日時"], skiprows=3)
    # Keep the weather columns, rename them per area, and drop the first row (index 0).
    tenki_1 = tenki_1[["年月日時", "天気"]].rename(columns={"天気": "天気_num_北海道"}).drop(0)
    tenki_3 = tenki_3[["天気"]].rename(columns={"天気": "天気_num_東エリア"}).drop(0)
    tenki_6 = tenki_6[["天気"]].rename(columns={"天気": "天気_num_西エリア"}).drop(0)
    tenki_9 = tenki_9[["天気"]].rename(columns={"天気": "天気_num_九州"}).drop(0)
    tenki = pd.concat([tenki_1, tenki_3, tenki_6, tenki_9], axis=1)
    return tenki
def main(db_file: str, ddl_script: str, place_cache: str, time_to_wro: str,
         offers_path: str, inet_curr: str, inet_popc: str):
    setup_log()
    log = getLogger()
    db_conn: sqlite3.Connection = sqlite3.connect(db_file)
    _init_tables(db_conn, ddl_script)

    log.info(f"Inserting Places cache {place_cache} into sqlite3 DB {db_file}...")
    places = filter(None, map(Place.from_csv_row, read_csv(place_cache)))
    for places_chunk in chunked(map(lambda p: p.to_sql_row(), places), 50):
        try:
            c = db_conn.cursor()
            c.executemany("INSERT INTO place VALUES (?,?,?,?,?)", places_chunk)
            c.close()
            db_conn.commit()
        except sqlite3.IntegrityError as e:
            log.warning(f"Could not insert {len(places_chunk)} rows "
                        f"[{places_chunk[0]}, ..., {places_chunk[-1]}]: {e}")

    log.info(f"Inserting time-to-wroclaw data from {time_to_wro} into DB...")
    time_to_wro_iter = ((r[0], int(r[1]) if r[1] else None) for r in read_csv(time_to_wro))
    for row_chunk in chunked(time_to_wro_iter, 50):
        try:
            c = db_conn.cursor()
            c.executemany("INSERT INTO time_to_wroclaw VALUES (?,?)", row_chunk)
            c.close()
            db_conn.commit()
        except sqlite3.IntegrityError as e:
            log.warning(f"Could not insert {len(row_chunk)} rows "
                        f"[{row_chunk[0]}, ..., {row_chunk[-1]}]: {e}")

    for inet_curr_csv in list_csv_files(inet_curr):
        log.info(f"Inserting current broadband data from CSVs under {inet_curr_csv} into DB...")
        inet_curr_rows = filter(None, map(_filter_fields_from_curr_inet_csv_row,
                                          read_csv(inet_curr_csv, delimiter=';')))
        broadband_accesses = filter(None, map(BroadbandAccess.from_csv_row, inet_curr_rows))
        _insert_broadband_access_obj(broadband_accesses, db_conn, log)

    for inet_popc_csv in list_csv_files(inet_popc):
        log.info(f"Inserting planned broadband expansion data from CSVs under {inet_popc_csv} into DB...")
        inet_curr_rows = filter(None, map(_filter_fields_from_planned_inet_csv_row,
                                          read_csv(inet_popc_csv, delimiter=';')))
        broadband_accesses = filter(None, map(BroadbandAccess.from_csv_row, inet_curr_rows))
        _insert_broadband_access_obj(broadband_accesses, db_conn, log)

    for offers_csv in list_csv_files(offers_path):
        log.info(f"Inserting Offers from CSV {offers_csv} into sqlite3 DB {db_file}...")
        offers = filter(None, map(ParcelOffer.from_csv_row, read_csv(offers_csv)))
        for offers_chunk in chunked(map(lambda p: p.to_sql_row(), offers), 50):
            try:
                c = db_conn.cursor()
                c.executemany("INSERT INTO parcel_offer VALUES (?,?,?,?,?,?,?)", offers_chunk)
                c.close()
                db_conn.commit()
            except sqlite3.IntegrityError as e:
                log.warning(f"Could not insert {len(offers_chunk)} rows "
                            f"[{offers_chunk[0]}, ..., {offers_chunk[-1]}]: {e}")

    db_conn.close()
    log.info("Done inserting Places cache and Offers into sqlite3 DB")
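# `chunked` above is not defined in this snippet. A minimal sketch of such a
# helper, assuming it simply groups an iterable into lists of at most `size`
# items (equivalent to more_itertools.chunked):
from itertools import islice
from typing import Iterable, Iterator, List, TypeVar

T = TypeVar("T")


def chunked(iterable: Iterable[T], size: int) -> Iterator[List[T]]:
    """Yield successive lists of at most `size` items from `iterable`."""
    it = iter(iterable)
    while True:
        chunk = list(islice(it, size))
        if not chunk:
            return
        yield chunk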
def main(isochrone: str, place_cache: str, output: str, polygon_step_time_min: int = 7):
    setup_log()
    log = getLogger()

    log.info(f"Reading isochrone map from {isochrone} ...")
    with codecs.open(isochrone, 'r', 'utf-8-sig') as map_:
        isochrone_map = json.load(map_)
    polygons = _build_polygons(isochrone_map)

    log.info(f"Reading places cache from {place_cache} ...")
    places = filter(None, map(Place.from_csv_row, read_csv(place_cache)))

    city_to_time_to_wroclaw: Dict[str, Optional[int]] = {}
    log.info("Finding time to reach destination for places...")
    for p in places:
        # Resolve each city only once.
        if p.city not in city_to_time_to_wroclaw:
            index = _index_of_polygon_point_is_in(p.lat, p.lon, polygons)
            if index != -1:
                time_to_wroclaw_min = index * polygon_step_time_min
                city_to_time_to_wroclaw[p.city] = time_to_wroclaw_min
            else:
                city_to_time_to_wroclaw[p.city] = None

    log.info(f"Writing {len(city_to_time_to_wroclaw)} results to {output} ...")
    write_csv(output, sorted([[k, v] for k, v in city_to_time_to_wroclaw.items()]))
    log.info("Done")
def read_spot_commit():
    spot_commit_filepath = os.path.join(
        common.data_dir, "Receipt/抽出データ_修正/Looop_スポット_約定.csv")
    df = common.read_csv(spot_commit_filepath, parse_dates=["日時"])
    return df
def main(broadband_city_csv: str, geojson: str):
    setup_log()
    log = getLogger()
    colors = color_gradient("#737373", "#F2F2F2", 3)
    points = []
    for i, cells in enumerate(read_csv(broadband_city_csv)):
        if i == 0 or not cells:
            continue
        try:
            city, ap_count = cells[0], int(cells[1])
            min_bw, avg_bw, max_bw = int(cells[2]), float(cells[3]), int(cells[4])
            lat, lon = randomize_coordinates(float(cells[5]), float(cells[6]), delta=0.004)
            color_class = calc_class(avg_bandwidth=avg_bw) if max_bw < 100 else 2
            point = render_geojson_point(lat=lat,
                                         lon=lon,
                                         marker_color=colors[color_class],
                                         marker_symbol='star',
                                         props={
                                             'title': city,
                                             'min-bandwidth': min_bw,
                                             'avg-bandwidth': avg_bw,
                                             'max-bandwidth': max_bw,
                                             'bandwidth-class': color_class
                                         })
            points.append(point)
        except (TypeError, ValueError, LookupError) as e:
            log.warning(f"Could not parse: {e}, cells: {cells}")
    if points:
        save_geojson(points, geojson)
def main(avg_city_prices_csv: str, output_geojson: str, headers: bool = False) -> None:
    setup_log()
    log = getLogger()
    log.info(f"Parsing CSV {avg_city_prices_csv} file into {output_geojson} GeoJSON...")
    csv_lines = list(read_csv(avg_city_prices_csv))
    if headers:
        _ = csv_lines.pop(0)
    colors = color_gradient("red", "green", 10)
    points = [
        render_geojson_point(lat=float(t[3]),
                             lon=float(t[4]),
                             marker_size="small",
                             marker_color=colors[_calc_value(int(float(t[1])))],
                             props={
                                 "title": t[0],
                                 "offer_count": int(float(t[2])),
                                 "price_per_sqm": f"{round(int(float(t[1])))} zł/m2"
                             }) for t in csv_lines if t[0]
    ]
    log.info(f"Rendering GeoJSON out of {len(points)} points...")
    save_geojson(points, output_geojson)
    log.info(f"Done rendering file {output_geojson}")
def read_jepx_spot():
    """Read the JEPX spot-market trading result files."""
    jepx_spot_filepath = os.path.join(common.data_dir, "jepx_spot/spot_*.csv")
    jepx_spot = pd.concat(
        common.read_csv(f, parse_dates=["年月日"])
        for f in glob.glob(jepx_spot_filepath))
    return jepx_spot
def read_total_amount():
    total_amount_filepath = os.path.join(
        common.data_dir,
        "Receipt/計画値_集計(201607~201704)修正版/計画値_集計(201607~201704)修正版.csv")
    df = common.read_csv(total_amount_filepath, parse_dates=["時間"])
    return df
def read_jepx_1hour():
    """Read the JEPX intraday (hour-ahead) market trading result files."""
    jikan_filepath = os.path.join(common.data_dir, "jepx_1hour/im_trade_summary_*.csv")
    df = pd.concat(
        common.read_csv(f, parse_dates=["年月日"])
        for f in glob.glob(jikan_filepath))
    return df
def special_calendar():
    fixed_calendar_path = os.path.join(common.data_dir, "normal_calendar/fixed_calendar.csv")
    special_calendar_path = os.path.join(common.data_dir, "special_calendar/special_calendar.csv")
    fixed = common.read_csv(fixed_calendar_path)
    sp = common.read_csv(special_calendar_path)
    fixed["年月日"] = pd.to_datetime(fixed["年月日"])
    sp["年月日"] = pd.to_datetime(sp["年月日"])
    df = fixed.merge(sp, how="left", on="年月日")
    # Label each date as weekday (平日), Saturday (土曜) or Sunday/holiday (日祝),
    # then let the special calendar (特別判定) override that label where defined.
    trend = df["平日"].mask(df["平日"] == 1, "平日")
    trend = trend.mask(df["土曜"] == 1, "土曜")
    trend = trend.mask(df["日祝"] == 1, "日祝")
    trend = trend.mask(df["特別判定"].notnull(), df["特別判定"])
    df["trend"] = trend
    return df
def make():
    """Build the normal calendar.

    Based on data fetched from http://calendar-service.net/api.php.
    The results of this program cannot be guaranteed, because the site it relies on
    may not remain available up to the chosen end date and the content it returns
    may change. For that reason, data up to 2037 is fetched in a single batch.
    If holidays are changed or added in the future, edit data/fixed_calendar.csv.

    Note: before running this program, download the raw (pre-formatting) CSV
    into the directory.

    [Options selected on the site]
        Start year/month: entered
        End year/month: entered
        Year notation: Western calendar
        Month notation: numbers only
        Day-of-week notation: Japanese (日, 月)
        Output format: CSV
        Output contents: unchecked
        Zero padding: checked
    """
    # CSV columns: 年, 月, 日, 年号, 和暦, 曜日, 曜日番号, 祝日名
    calendar_path = os.path.join(common.data_dir, "normal_calendar/calendar.csv")
    fixed_calendar_path = os.path.join(common.data_dir, "normal_calendar/fixed_calendar.csv")
    df = common.read_csv(calendar_path, encoding="utf8")

    def to_datetime(row):
        # Build a date from the year/month/day columns.
        return pd.Timestamp(int(row["年"]), int(row["月"]), int(row["日"])).date()

    date = df.apply(to_datetime, axis=1)
    fixed_df = pd.DataFrame()
    fixed_df["年月日"] = date
    fixed_df["曜日"] = df["曜日"]
    fixed_df["祝日名"] = df["祝日名"]
    fixed_df["平日"] = df["曜日"].isin(list("月火水木金")).astype(int)
    fixed_df["土曜"] = (df["曜日"] == "土").astype(int)
    fixed_df["日祝"] = ((df["曜日"] == "日") | (df["祝日名"].notnull())).astype(int)
    fixed_df.to_csv(fixed_calendar_path, index=False)
def main(avg_city_prices_csv: str, output_geojson: str, headers: bool = False) -> None:
    setup_log()
    log = getLogger()
    log.info(f"Parsing CSV {avg_city_prices_csv} file into {output_geojson} GeoJSON...")
    csv_lines = list(read_csv(avg_city_prices_csv))
    if headers:
        _ = csv_lines.pop(0)
    colors = color_gradient("red", "green", 10)
    points = list(
        filter(None, [row_to_point(t, colors, log) for t in csv_lines if t[0]]))
    log.info(f"Rendering GeoJSON out of {len(points)} points...")
    save_geojson(points, output_geojson)
    log.info(f"Done rendering file {output_geojson}")
def main():
    print("Processing files!")
    while True:
        try:
            file_names = get_files_paths(f"{get_current_folder_path()}/{RAW_FILES}")
            for file_name in file_names:
                path = f"{get_current_folder_path()}/{RAW_FILES}/{file_name}"
                data = read_csv(path, ("region", "tweet"))
                processing_data(data)
                move_twitter_file(file_name)
                print(f"processed: {file_name}")
        except Exception as e:
            print(f"Error: {str(e)}")
            raise e
        finally:
            print("Sleeping")
            minutes = 1
            time.sleep(minutes * 60)
            print("Processing again")
def main(map_quest_api_key: str, csv_cache: str, offers_directory: str):
    setup_log()
    log = getLogger()
    client = MapQuestClient(map_quest_api_key, log)
    resolver = PlaceResolver(client, log)
    if path.isfile(csv_cache):
        resolver.load(csv_cache)
        log.info(f"Loaded {csv_cache} with {len(resolver.cache.keys())} addresses")
    for csv_file in list_csv_files(offers_directory):
        log.info(f"Parsing CSV {csv_file}")
        for row in read_csv(csv_file):
            offer = ParcelOffer.from_csv_row(row)
            if offer:
                _ = resolver.get(offer)
            else:
                log.warning(f"Could not parse into offer: {row}")
    log.info(f"Storing cache with {len(resolver.cache.keys())} into {csv_cache}")
    resolver.save(csv_cache)
import pandas as pd
import numpy as np
import gc
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from common import timer, read_csv, ItemSelector

fp_col = [f'fp_{i}' for i in range(167)]
w2v_col = [f'w2v_{i}' for i in range(128)]

with timer("Load data"):
    df_affinity_train = read_csv("df_affinity_train.csv")
    df_molecule = read_csv("df_molecule_stat.csv", "./input/temp/")
    df_protein_w2v = read_csv("df_w2v_ws3.csv", "./input/temp/")
    df_affinity_test = read_csv("df_affinity_test_toBePredicted.csv")
    df_train = df_affinity_train.merge(df_molecule)
    df_train = df_train.merge(df_protein_w2v)
    df_test = df_affinity_test.merge(df_molecule)
    df_test = df_test.merge(df_protein_w2v)
    test = df_test

all_protein_id = df_train.Protein_ID.unique()
kfold = KFold(n_splits=5, shuffle=True, random_state=2018)
def read():
    """Read the CSV used for analysis."""
    return common.read_csv(output_path, parse_dates=["fc_datetime"])
def load(self, csv_cache):
    with self._cache_lock:
        for line in read_csv(csv_cache):
            self.cache[line[0]] = Place.from_csv_row(line)
def read_price_to_amount():
    price_to_amount_path = os.path.join(common.data_dir, "Processed/price_to_amount.csv")
    df = common.read_csv(price_to_amount_path)
    return df
        # print('--> training')
        return Training()
    elif pathname == config.classification_url:
        # print('--> classification')
        return Classification()


# home page ###############################################################################

# login page ##############################################################################

# training page ###########################################################################

input_csv = "static/genomic.csv"
expected_label = ''
df = common.read_csv(input_csv)


def loadNewVariant():
    """Load a random variant from the CSV.

    Return:
        data_s (DataFrame): loaded data
    """
    # print(' *** loadNewVariant()')
    data_s = df.sample()
    return data_s
import common
import copy
import math
import backpropogation as b
import random
from random import shuffle
import metrics_testing as mt
import GetData as data_handler
import KNearestNeighbor as Knn
import numpy as np

# DataSet URL: https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/ (breast-cancer-wisconsin.data)
# Class 2 is benign and 4 is malignant.
data = common.read_csv(
    "C:/Users/andre/PycharmProjects/MachineLearningFinal/Data/breastCancer.csv"
)

# Since the first feature is just an id number, it doesn't provide any useful information.
common.remove_nth_column(data, 0)

class_index = 9
first_attribute_index = 0
last_attribute_index = 8

# Update class labels: 2 => 0 and 4 => 1.
for point in data:
    if point[class_index] == '2':
        point[class_index] = '0'
    elif point[class_index] == '4':
        point[class_index] = '1'
def read_b_value():
    """Read the β-value file."""
    b_value_filepath = os.path.join(common.data_dir, "Processed/B値.csv")
    df = common.read_csv(b_value_filepath)
    return df
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import FeatureUnion, Pipeline, make_union, make_pipeline
from common import timer, read_csv, ItemSelector, TextStats

with timer("Load data"):
    df_protein_train = read_csv("df_protein_train.csv")
    df_protein_test = read_csv("df_protein_test.csv")
    df_protein = pd.concat([df_protein_train, df_protein_test])
    df_protein.Sequence = df_protein.Sequence.apply(lambda x: x.upper())

feature_union = make_union(
    make_pipeline(ItemSelector(key="Sequence"),
                  CountVectorizer(analyzer='char', ngram_range=(1, 1))),
    make_pipeline(ItemSelector(key="Sequence"),
                  TfidfVectorizer(analyzer='char', ngram_range=(1, 1), use_idf=False)),
    make_pipeline(ItemSelector(key="Sequence"), TextStats(), DictVectorizer()))

with timer("Fit feature_union"):
    feat = feature_union.fit_transform(df_protein)

out_col = [f'protein_stat_{i}' for i in range(feat.shape[1])]
output_file = "./input/temp/df_protein_stat.csv"

with timer(f"Save file to {output_file}"):
def get_sentiments_from_csv():
    return read_csv(f"{FINAL_FILE}/final_file.csv",
                    ("region", "positive", "neutral", "negative"))
import common
import numpy as np

s = common.read_csv("imbalance_server_bimodal_cpu/Cheetah_-_AWRRLAT99.csv")
p = common.read_csv("imbalance_server_bimodal_cpu/Cheetah_-_Pow2LAT99.csv")
c = common.read_csv("imbalance_server_bimodal_cpu/Cheetah_-_RRLAT99.csv")
h = common.read_csv("imbalance_server_bimodal_cpu/Hash_RSSLAT99.csv")

# Row-wise mean of the LAT99 columns (first column skipped), ignoring NaNs.
sp = np.nanmean(s[:, 1:], axis=1)
cp = np.nanmean(c[:, 1:], axis=1)
hp = np.nanmean(h[:, 1:], axis=1)
pp = np.nanmean(p[:, 1:], axis=1)

print("Awrr vs RR", cp / sp)
print("Awrr vs Hash", hp / sp)
print("Pow2 vs RR", cp / pp)
print("Pow2 vs Hash", hp / pp)
import pandas as pd
import numpy as np
from common import timer, read_csv

fp_col = [f'fp_{i}' for i in range(167)]
out_col = ["molecule_count"] + fp_col
output_file = "./input/temp/df_molecule_stat.csv"

with timer("Load data"):
    df_molecule = read_csv("df_molecule.csv")
    df_aff_train = read_csv("df_affinity_train.csv")
    df_aff_test = read_csv("df_affinity_test_toBePredicted.csv")
    df_aff = pd.concat([df_aff_train, df_aff_test])

with timer("Make molecule count feature"):
    df_molecule_count = df_aff.groupby("Molecule_ID", as_index=False).Ki.agg(
        {"molecule_count": "count"})
    df_molecule = df_molecule.merge(df_molecule_count, on=["Molecule_ID"])

with timer("Parse fingerprint"):
    fingerprint = df_molecule.Fingerprint.apply(
        lambda x: np.array(x.split(', '))).values
    fingerprint = np.vstack(fingerprint).astype(np.uint8)
    df_fingerprint = pd.DataFrame(fingerprint, columns=fp_col, dtype=np.uint8)
    df_molecule = pd.concat([df_molecule, df_fingerprint], axis=1)
    del df_fingerprint, fingerprint

with timer(f"Save file to {output_file}"):
    df_molecule[['Molecule_ID'] + out_col].to_csv(output_file, index=False)
import common
import copy
import math
import backpropogation as b
import random
from random import shuffle
import metrics_testing as mt
import GetData as data_handler

# URL: https://archive.ics.uci.edu/ml/datasets/Heart+Disease
data = common.read_csv('C:/Users/andre/PycharmProjects/MachineLearningFinal/Data/heartDisease.csv')

class_index = 13
class_values = [0, 1, 2, 3, 4]

# Remove data points with missing attributes (only 16 of the 600+ data points are affected).
common.remove_points_with_missing_attributes(data)
shuffle(data)


def split_data_in_ten_parts(data, class_index):
    list1 = []
    list2 = []
    list3 = []
import common
import copy
import math
import backpropogation as b
import random
from random import shuffle
import metrics_testing as mt
import GetData as data_handler
import KNearestNeighbor as Knn
import numpy as np

# URL for dataset: https://archive.ics.uci.edu/ml/datasets/glass+identification
data = common.read_csv('C:/Users/andre/PycharmProjects/MachineLearningFinal/Data/glass.csv')

# Since the first feature is just an id number, it doesn't provide any useful information.
common.remove_nth_column(data, 0)

class_index = 9
first_attribute_index = 0
last_attribute_index = 8

# Update class labels: 1 => 0, 2 => 1, 3 => 2, 4 => 3, 5 => 4, 6 => 5, 7 => 6.
for point in data:
    val = float(point[class_index]) - 1
    point[class_index] = str(val)

class_values = [0, 1, 2, 3, 4, 5, 6]
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy import sparse
from sklearn.pipeline import FeatureUnion, Pipeline, make_union, make_pipeline
from scipy.sparse import coo_matrix
from common import timer, read_csv, ItemSelector, TextStats

path = '../data/'

with timer("Load data"):
    df_apps_train = read_csv(path + "train/apps.csv")
    df_apps_test = read_csv(path + "test/apps.csv")
    df_apps = pd.concat([df_apps_train, df_apps_test])

feature_union = make_union(
    make_pipeline(ItemSelector(key="apps"),
                  CountVectorizer(analyzer='word', ngram_range=(1, 1))),
    make_pipeline(ItemSelector(key="apps"),
                  TfidfVectorizer(analyzer='word', ngram_range=(1, 1), use_idf=False)),
    make_pipeline(ItemSelector(key="apps"), TextStats(), DictVectorizer()))

with timer("Fit feature_union"):
    feat = feature_union.fit_transform(df_apps)

out_col = ["app_stat_%s" % (str(i)) for i in range(feat.shape[1])]
output_file = '123'

with timer("Save file to %s" % (output_file)):
    data = coo_matrix((feat.todense())).tocsr()
    sparse.save_npz('../data/train/train_app_stat.npz', data[:df_apps_train.shape[0]])
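# `ItemSelector` and `TextStats` are imported from `common` but their
# definitions are not part of these snippets. A minimal sketch of what such
# transformers usually look like (an assumption following the standard
# scikit-learn custom-transformer pattern, not this project's actual code):
from sklearn.base import BaseEstimator, TransformerMixin


class ItemSelector(BaseEstimator, TransformerMixin):
    """Select one DataFrame column so downstream text vectorizers can consume it."""

    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]


class TextStats(BaseEstimator, TransformerMixin):
    """Compute simple per-text statistics for a downstream DictVectorizer."""

    def fit(self, X, y=None):
        return self

    def transform(self, texts):
        return [{"length": len(t), "num_tokens": len(t.split())} for t in texts]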