Code Example #1
def read_weather():
    """Read the hourly weather CSVs for Sapporo, Tokyo, Osaka and Fukuoka and join them column-wise."""
    path_1 = "Receipt/気象情報/札幌(天気【2016年4月1日~2017年3月30日】).csv"
    path_3 = "Receipt/気象情報/東京(天気【2016年4月1日~2017年3月30日】).csv"
    path_6 = "Receipt/気象情報/大阪(天気【2016年4月1日~2017年3月30日】).csv"
    path_9 = "Receipt/気象情報/福岡(天気【2016年4月1日~2017年3月30日】).csv"

    tenki_filepath_1 = os.path.join(common.data_dir, path_1)
    tenki_filepath_3 = os.path.join(common.data_dir, path_3)
    tenki_filepath_6 = os.path.join(common.data_dir, path_6)
    tenki_filepath_9 = os.path.join(common.data_dir, path_9)

    tenki_1 = common.read_csv(tenki_filepath_1,
                              parse_dates=["年月日時"],
                              skiprows=3)
    tenki_3 = common.read_csv(tenki_filepath_3,
                              parse_dates=["年月日時"],
                              skiprows=3)
    tenki_6 = common.read_csv(tenki_filepath_6,
                              parse_dates=["年月日時"],
                              skiprows=3)
    tenki_9 = common.read_csv(tenki_filepath_9,
                              parse_dates=["年月日時"],
                              skiprows=3)

    tenki_1 = tenki_1[["年月日時", "天気"]].rename(columns={
        "天気": "天気_num_北海道"
    }).drop(0)
    tenki_3 = tenki_3[["天気"]].rename(columns={"天気": "天気_num_東エリア"}).drop(0)
    tenki_6 = tenki_6[["天気"]].rename(columns={"天気": "天気_num_西エリア"}).drop(0)
    tenki_9 = tenki_9[["天気"]].rename(columns={"天気": "天気_num_九州"}).drop(0)

    tenki = pd.concat([tenki_1, tenki_3, tenki_6, tenki_9], axis=1)

    return tenki
Code Example #2
def main(db_file: str, ddl_script: str, place_cache: str, time_to_wro: str, offers_path: str, inet_curr: str,
         inet_popc: str):
    setup_log()
    log = getLogger()
    db_conn: sqlite3.Connection = sqlite3.connect(db_file)
    _init_tables(db_conn, ddl_script)

    log.info(f"Inserting Places cache {place_cache} into sqlite3 DB {db_file}...")
    places = filter(None, map(Place.from_csv_row, read_csv(place_cache)))
    for places_chunk in chunked(map(lambda p: p.to_sql_row(), places), 50):
        try:
            c = db_conn.cursor()
            c.executemany("INSERT INTO place VALUES (?,?,?,?,?)", places_chunk)
            c.close()
            db_conn.commit()
        except sqlite3.IntegrityError as e:
            log.warning(f"Could not insert {len(places_chunk)} rows [{places_chunk[0]}, ..., {places_chunk[-1]}]: {e}")

    log.info(f"Inserting time-to-wroclaw data from {time_to_wro} into DB...")
    time_to_wro_iter = ((r[0], int(r[1]) if r[1] else None) for r in read_csv(time_to_wro))
    for row_chunk in chunked(time_to_wro_iter, 50):
        try:
            c = db_conn.cursor()
            c.executemany("INSERT INTO time_to_wroclaw VALUES (?,?)", row_chunk)
            c.close()
            db_conn.commit()
        except sqlite3.IntegrityError as e:
            log.warning(f"Could not insert {len(row_chunk)} rows [{row_chunk[0]}, ..., {row_chunk[-1]}]: {e}")

    for inet_curr_csv in list_csv_files(inet_curr):
        log.info(f"Inserting current broadband data from CSVs under {inet_curr_csv} into DB...")
        inet_curr_rows = filter(None,
                                map(_filter_fields_from_curr_inet_csv_row,
                                    read_csv(inet_curr_csv, delimiter=';')))
        broadband_accesses = filter(None, map(BroadbandAccess.from_csv_row, inet_curr_rows))
        _insert_broadband_access_obj(broadband_accesses, db_conn, log)

    for inet_popc_csv in list_csv_files(inet_popc):
        log.info(
            f"Inserting planned broadband expansion data from CSV {inet_popc_csv} into DB...")
        inet_popc_rows = filter(None,
                                map(_filter_fields_from_planned_inet_csv_row, read_csv(inet_popc_csv, delimiter=';')))
        broadband_accesses = filter(None, map(BroadbandAccess.from_csv_row, inet_popc_rows))
        _insert_broadband_access_obj(broadband_accesses, db_conn, log)

    for offers_csv in list_csv_files(offers_path):
        log.info(f"Inserting Offers from CSV {offers_csv} into sqlite3 DB {db_file}...")
        offers = filter(None, map(ParcelOffer.from_csv_row, read_csv(offers_csv)))
        for offers_chunk in chunked(map(lambda p: p.to_sql_row(), offers), 50):
            try:
                c = db_conn.cursor()
                c.executemany("INSERT INTO parcel_offer VALUES (?,?,?,?,?,?,?)", offers_chunk)
                c.close()
                db_conn.commit()
            except sqlite3.IntegrityError as e:
                log.warning(
                    f"Could not insert {len(offers_chunk)} rows [{offers_chunk[0]}, ..., {offers_chunk[-1]}]: {e}")

    db_conn.close()
    log.info("Done inserting Places cache and Offers into sqlite3 DB")
Code Example #3
def main(isochrone: str,
         place_cache: str,
         output: str,
         polygon_step_time_min: int = 7):
    setup_log()
    log = getLogger()

    log.info(f"Reading isochrone map from {isochrone} ...")
    with codecs.open(isochrone, 'r', 'utf-8-sig') as map_:
        isochrone_map = json.load(map_)
    polygons = _build_polygons(isochrone_map)

    log.info(f"Reading places cache from {place_cache} ...")
    places = filter(None, map(Place.from_csv_row, read_csv(place_cache)))
    city_to_time_to_wroclaw: Dict[str, Optional[int]] = {}

    log.info(f"Finding time to reach destination for places...")
    for p in places:
        if p not in city_to_time_to_wroclaw.keys():
            index = _index_of_polygon_point_is_in(p.lat, p.lon, polygons)
            if index != -1:
                time_to_wroclaw_min = index * polygon_step_time_min
                city_to_time_to_wroclaw[p.city] = time_to_wroclaw_min
            else:
                city_to_time_to_wroclaw[p.city] = None

    log.info(f"Writing {len(city_to_time_to_wroclaw)} results to {output} ...")
    write_csv(output,
              sorted([[k, v] for k, v in city_to_time_to_wroclaw.items()]))
    log.info("Done")
Code Example #4
def read_spot_commit():
    spot_commit_filepath = os.path.join(common.data_dir,
                                        "Receipt/抽出データ_修正/Looop_スポット_約定.csv")

    df = common.read_csv(spot_commit_filepath, parse_dates=["日時"])

    return df
Code Example #5
def main(broadband_city_csv: str, geojson: str):
    setup_log()
    log = getLogger()
    colors = color_gradient("#737373", "#F2F2F2", 3)
    points = []
    for i, cells in enumerate(read_csv(broadband_city_csv)):
        if i == 0 or not cells:
            continue
        try:
            city, ap_count = cells[0], int(cells[1])
            min_bw, avg_bw, max_bw = int(cells[2]), float(cells[3]), int(cells[4])
            lat, lon = randomize_coordinates(float(cells[5]), float(cells[6]), delta=0.004)
            color_class = calc_class(avg_bandwidth=avg_bw) if max_bw < 100 else 2
            point = render_geojson_point(lat=lat,
                                         lon=lon,
                                         marker_color=colors[color_class],
                                         marker_symbol='star',
                                         props={
                                             'title': city,
                                             'min-bandwidth': min_bw,
                                             'avg-bandwidth': avg_bw,
                                             'max-bandwidth': max_bw,
                                             'bandwidth-class': color_class
                                         })
            points.append(point)
        except (TypeError, ValueError, LookupError) as e:
            log.warning(f"Could not parse: {e}, cells: {cells}")
    if points:
        save_geojson(points, geojson)
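randomize_coordinates and calc_class are also project helpers not included here. A plausible sketch of the former, assuming it simply jitters each coordinate by up to ±delta degrees so markers for the same city do not overlap exactly:

import random
from typing import Tuple


def randomize_coordinates(lat: float, lon: float, delta: float = 0.004) -> Tuple[float, float]:
    # Add independent uniform noise in [-delta, +delta] to each coordinate.
    return lat + random.uniform(-delta, delta), lon + random.uniform(-delta, delta)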
Code Example #6
def main(avg_city_prices_csv: str,
         output_geojson: str,
         headers: bool = False) -> None:
    setup_log()
    log = getLogger()
    log.info(
        f"Parsing CSV {avg_city_prices_csv} file into {output_geojson} GeoJSON..."
    )
    csv_lines = list(read_csv(avg_city_prices_csv))
    if headers:
        _ = csv_lines.pop(0)
    colors = color_gradient("red", "green", 10)
    points = [
        render_geojson_point(lat=float(t[3]),
                             lon=float(t[4]),
                             marker_size="small",
                             marker_color=colors[_calc_value(int(float(t[1])))],
                             props={
                                 "title": t[0],
                                 "offer_count": int(float(t[2])),
                                 "price_per_sqm": f"{round(int(float(t[1])))} zł/m2",
                             }) for t in csv_lines if t[0]
    ]
    log.info(f"Rendering GeoJSON out of {len(points)} points...")
    save_geojson(points, output_geojson)
    log.info(f"Done rendering file {output_geojson}")
Code Example #7
def read_jepx_spot():
    """JEPX SPOT取引結果ファイル読み込み"""
    jepx_spot_filepath = os.path.join(common.data_dir, "jepx_spot/spot_*.csv")
    jepx_spot = pd.concat(
        common.read_csv(f, parse_dates=["年月日"])
        for f in glob.glob(jepx_spot_filepath))

    return jepx_spot
Code Example #8
def read_total_amount():
    total_amount_filepath = os.path.join(
        common.data_dir,
        "Receipt/計画値_集計(201607~201704)修正版/計画値_集計(201607~201704)修正版.csv")

    df = common.read_csv(total_amount_filepath, parse_dates=["時間"])

    return df
Code Example #9
def read_jepx_1hour():
    """JEPX時間前取引結果ファイル読み込み"""
    jikan_filepath = os.path.join(common.data_dir,
                                  "jepx_1hour/im_trade_summary_*.csv")
    df = pd.concat(
        common.read_csv(f, parse_dates=["年月日"])
        for f in glob.glob(jikan_filepath))
    return df
Code Example #10
def special_calendar():
    fixed_calendar_path = os.path.join(common.data_dir,
                                       "normal_calendar/fixed_calendar.csv")
    special_calendar_path = os.path.join(
        common.data_dir, "special_calendar/special_calendar.csv")
    fixed = common.read_csv(fixed_calendar_path)
    sp = common.read_csv(special_calendar_path)

    fixed["年月日"] = pd.to_datetime(fixed["年月日"])
    sp["年月日"] = pd.to_datetime(sp["年月日"])
    df = fixed.merge(sp, how="left", on="年月日")

    trend = df["平日"].mask(df["平日"] == 1, "平日")
    trend = trend.mask(df["土曜"] == 1, "土曜")
    trend = trend.mask(df["日祝"] == 1, "日祝")
    trend = trend.mask(df["特別判定"].notnull(), df["特別判定"])

    df["trend"] = trend

    return df
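To see how the mask chain resolves a single day's label, here is a small self-contained run over two hypothetical rows (an ordinary Saturday, and a weekday overridden by a special-day label):

import pandas as pd

demo = pd.DataFrame({
    "平日": [0, 1],
    "土曜": [1, 0],
    "日祝": [0, 0],
    "特別判定": [None, "年末年始"],  # hypothetical special-day label
})
trend = demo["平日"].mask(demo["平日"] == 1, "平日")
trend = trend.mask(demo["土曜"] == 1, "土曜")
trend = trend.mask(demo["日祝"] == 1, "日祝")
trend = trend.mask(demo["特別判定"].notnull(), demo["特別判定"])
print(trend.tolist())  # ['土曜', '年末年始']

The special-day column wins because its mask is applied last.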
Code Example #11
def make():
    """通常カレンダーの作成
    http://calendar-service.net/api.php から取得したデータをもとに作成

    このプログラムは利用サイト(終了年月の継続可能性が確実ではない、
    かつ取得内容が変更されるかもしれないので実行結果を
    保証できません。

    そのため、2037年までのデータを一括で取得しています。
    今後、祝日が変更や追加された際には、
    data/fixed_calendar.csv を修正してください。

    ※このプログラムを実行する際は、
    ※事前にディレクトリへ整形前CSVをダウンロードしておくこと

    [選択内容]
    開始年月の入力
    終了年月の入力
    年表記:西暦
    月表記:数字のみ
    曜日評議:日本語(日、月)
    出力フォーマット:CSV
    出力内容:チェックをしない
    ゼロ埋め:チェックをする
    """
    # CSV columns: 年, 月, 日, 年号, 和暦, 曜日, 曜日番号, 祝日名 (year, month, day, era, Japanese-era year, weekday, weekday number, holiday name)
    calendar_path = os.path.join(common.data_dir,
                                 "normal_calendar/calendar.csv")
    fixed_calendar_path = os.path.join(common.data_dir,
                                       "normal_calendar/fixed_calendar.csv")

    df = common.read_csv(calendar_path, encoding="utf8")

    def to_datetime(row):
        # pd.datetime is no longer available in current pandas; build the date via pd.Timestamp instead
        return pd.Timestamp(int(row["年"]), int(row["月"]), int(row["日"])).date()

    date = df.apply(to_datetime, axis=1)

    fixed_df = pd.DataFrame()
    fixed_df["年月日"] = date
    fixed_df["曜日"] = df["曜日"]
    fixed_df["祝日名"] = df["祝日名"]
    fixed_df["平日"] = df["曜日"].isin(list("月火水木金")).astype(int)
    fixed_df["土曜"] = (df["曜日"] == "土").astype(int)
    fixed_df["日祝"] = ((df["曜日"] == "日") | (df["祝日名"].notnull())).astype(int)

    fixed_df.to_csv(fixed_calendar_path, index=False)
Code Example #12
def main(avg_city_prices_csv: str,
         output_geojson: str,
         headers: bool = False) -> None:
    setup_log()
    log = getLogger()
    log.info(
        f"Parsing CSV {avg_city_prices_csv} file into {output_geojson} GeoJSON..."
    )
    csv_lines = list(read_csv(avg_city_prices_csv))
    if headers:
        _ = csv_lines.pop(0)
    colors = color_gradient("red", "green", 10)
    points = list(
        filter(None,
               [row_to_point(t, colors, log) for t in csv_lines if t[0]]))
    log.info(f"Rendering GeoJSON out of {len(points)} points...")
    save_geojson(points, output_geojson)
    log.info(f"Done rendering file {output_geojson}")
Code Example #13
def main():
    print("Processing files!")
    while True:
        try:
            file_names = get_files_paths(
                f"{get_current_folder_path()}/{RAW_FILES}")
            for file_name in file_names:
                path = f"{get_current_folder_path()}/{RAW_FILES}/{file_name}"
                data = read_csv(path, ("region", "tweet"))
                processing_data(data)
                move_twitter_file(file_name)
                print(f"processed: {file_name}")
        except Exception as e:
            print(f"Error: {str(e)}")
            raise e
        finally:
            print("Sleeping")
            minutes = 1
            time.sleep(minutes * 60)
            print("Proccessing again")
Code Example #14
File: cache_places.py  Project: emkor/estate-analysis
def main(map_quest_api_key: str, csv_cache: str, offers_directory: str):
    setup_log()
    log = getLogger()
    client = MapQuestClient(map_quest_api_key, log)
    resolver = PlaceResolver(client, log)

    if path.isfile(csv_cache):
        resolver.load(csv_cache)
        log.info(
            f"Loaded {csv_cache} with {len(resolver.cache.keys())} addresses")

    for csv_file in list_csv_files(offers_directory):
        log.info(f"Parsing CSV {csv_file}")
        for row in read_csv(csv_file):
            offer = ParcelOffer.from_csv_row(row)
            if offer:
                _ = resolver.get(offer)
            else:
                log.warning(f"Could not parse into offer: {row}")
        log.info(
            f"Storing cache with {len(resolver.cache.keys())} addresses into {csv_cache}"
        )
        resolver.save(csv_cache)
Code Example #15
import pandas as pd
import numpy as np

import gc

from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold

from common import timer, read_csv, ItemSelector

fp_col = [f'fp_{i}' for i in range(167)]
w2v_col = [f'w2v_{i}' for i in range(128)]

with timer("Load data"):
    df_affinity_train = read_csv("df_affinity_train.csv")
    df_molecule = read_csv("df_molecule_stat.csv", "./input/temp/")
    df_protein_w2v = read_csv("df_w2v_ws3.csv", "./input/temp/")
    df_affinity_test = read_csv("df_affinity_test_toBePredicted.csv")

df_train = df_affinity_train.merge(df_molecule)
df_train = df_train.merge(df_protein_w2v)

df_test = df_affinity_test.merge(df_molecule)
df_test = df_test.merge(df_protein_w2v)

test = df_test
all_protein_id = df_train.Protein_ID.unique()

kfold = KFold(n_splits=5, shuffle=True, random_state=2018)
Code Example #16
def read():
    """分析用CSVを読み込み"""
    return common.read_csv(output_path, parse_dates=["fc_datetime"])
Code Example #17
    def load(self, csv_cache):
        with self._cache_lock:
            for line in read_csv(csv_cache):
                self.cache[line[0]] = Place.from_csv_row(line)
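The matching save method, which Code Example #14 calls as resolver.save(csv_cache), is not shown in this listing. A minimal sketch, assuming write_csv mirrors read_csv and that Place exposes a to_csv_row serializer whose first field is the cache key (both are assumptions, not confirmed by the snippets above):

    def save(self, csv_cache):
        with self._cache_lock:
            # Write one row per cached Place; assumes Place.to_csv_row() round-trips through Place.from_csv_row().
            write_csv(csv_cache, [place.to_csv_row() for place in self.cache.values()])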
Code Example #18
def read_price_to_amount():
    price_to_amount_path = os.path.join(common.data_dir,
                                        "Processed/price_to_amount.csv")
    df = common.read_csv(price_to_amount_path)

    return df
Code Example #19
        # print('--> training')
        return Training()
    elif pathname == config.classification_url:
        # print('--> classification')
        return Classification()


# home page ###############################################################################

# login page ##############################################################################

# training page ###########################################################################

input_csv = "static/genomic.csv"
expected_label = ''
df = common.read_csv(input_csv)


def loadNewVariant():
    """
    Load a random variant from the CSV
    Return:
        data_s (DataFrame): loaded data
    """

    # print('  *** loadNewVariant()')
    data_s = df.sample()

    return data_s

Code Example #20
import common
import copy
import math
import backpropogation as b
import random
from random import shuffle
import metrics_testing as mt
import GetData as data_handler
import KNearestNeighbor as Knn
import numpy as np

# DataSet URL: https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/ (breast-cancer-wisconsin.data)
#Class 2 for benign and 4 for malignant

data = common.read_csv(
    "C:/Users/andre/PycharmProjects/MachineLearningFinal/Data/breastCancer.csv"
)

# since first feature is just an id number, this doesn't provide any useful information
common.remove_nth_column(data, 0)

class_index = 9
first_attribute_index = 0
last_attribute_index = 8

# update class labels: 2 => 0 and 4 => 1
for point in data:
    if point[class_index] == '2':
        point[class_index] = '0'
    elif point[class_index] == '4':
        point[class_index] = '1'
Code Example #21
def read_b_value():
    """β値ファイル読み込み"""
    b_value_filepath = os.path.join(common.data_dir, "Processed/B値.csv")
    df = common.read_csv(b_value_filepath)

    return df
Code Example #22
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.pipeline import FeatureUnion, Pipeline, make_union, make_pipeline

from common import timer, read_csv, ItemSelector, TextStats

with timer("Load data"):
    df_protein_train = read_csv("df_protein_train.csv")
    df_protein_test = read_csv("df_protein_test.csv")

df_protein = pd.concat([df_protein_train, df_protein_test])
df_protein.Sequence = df_protein.Sequence.apply(lambda x: x.upper())

feature_union = make_union(
    make_pipeline(ItemSelector(key="Sequence"),
                  CountVectorizer(analyzer='char', ngram_range=(1, 1))),
    make_pipeline(
        ItemSelector(key="Sequence"),
        TfidfVectorizer(analyzer='char', ngram_range=(1, 1), use_idf=False)),
    make_pipeline(ItemSelector(key="Sequence"), TextStats(), DictVectorizer()))

with timer("Fit feature_union"):
    feat = feature_union.fit_transform(df_protein)

out_col = [f'protein_stat_{i}' for i in range(feat.shape[1])]
output_file = "./input/temp/df_protein_stat.csv"

with timer(f"Save file to {output_file}"):
Code Example #23
def get_sentiments_from_csv():
    return read_csv(f"{FINAL_FILE}/final_file.csv",
                    ("region", "positive", "neutral", "negative"))
Code Example #24
import common
import numpy as np

s = common.read_csv("imbalance_server_bimodal_cpu/Cheetah_-_AWRRLAT99.csv")
p = common.read_csv("imbalance_server_bimodal_cpu/Cheetah_-_Pow2LAT99.csv")
c = common.read_csv("imbalance_server_bimodal_cpu/Cheetah_-_RRLAT99.csv")
h = common.read_csv("imbalance_server_bimodal_cpu/Hash_RSSLAT99.csv")
sp = np.nanmean(s[:, 1:], axis=1)
cp = np.nanmean(c[:, 1:], axis=1)
hp = np.nanmean(h[:, 1:], axis=1)
pp = np.nanmean(p[:, 1:], axis=1)
print("Awrr vs RR", cp / sp)
print("Awrr vs Hash", hp / sp)
print("Pow2 vs RR", cp / pp)
print("Pow2 vs Hash", hp / pp)
Code Example #25
import pandas as pd
import numpy as np

from common import timer, read_csv

fp_col = [f'fp_{i}' for i in range(167)]
out_col = ["molecule_count"] + fp_col
output_file = "./input/temp/df_molecule_stat.csv"

with timer("Load data"):
    df_molecule = read_csv("df_molecule.csv")
    df_aff_train = read_csv("df_affinity_train.csv")
    df_aff_test = read_csv("df_affinity_test_toBePredicted.csv")

df_aff = pd.concat([df_aff_train, df_aff_test])

with timer("Make molecule count feature"):
    df_molecule_count = df_aff.groupby("Molecule_ID", as_index=False).Ki.agg(
        {"molecule_count": "count"})
    df_molecule = df_molecule.merge(df_molecule_count, on=["Molecule_ID"])

with timer("Parse fingerprint"):
    fingerprint = df_molecule.Fingerprint.apply(
        lambda x: np.array(x.split(', '))).values
    fingerprint = np.vstack(fingerprint).astype(np.uint8)
    df_fingerprint = pd.DataFrame(fingerprint, columns=fp_col, dtype=np.uint8)
    df_molecule = pd.concat([df_molecule, df_fingerprint], axis=1)
    del df_fingerprint, fingerprint

with timer(f"Save file to {output_file}"):
    df_molecule[['Molecule_ID'] + out_col].to_csv(output_file, index=False)
Code Example #26
import common
import copy
import math
import backpropogation as b
import random
from random import shuffle
import metrics_testing as mt
import GetData as data_handler


# URL https://archive.ics.uci.edu/ml/datasets/Heart+Disease

data = common.read_csv('C:/Users/andre/PycharmProjects/MachineLearningFinal/Data/heartDisease.csv')

class_index = 13
class_values = [0, 1, 2, 3, 4]

# remove data points with missing attributes (since there are only 16 out of over 600 data points)
common.remove_points_with_missing_attributes(data)

shuffle(data)

def split_data_in_ten_parts(data,  class_index):
    list1 = []
    list2 = []
    list3 = []
Code Example #27
import common
import copy
import math
import backpropogation as b
import random
from random import shuffle
import metrics_testing as mt
import GetData as data_handler
import KNearestNeighbor as Knn
import numpy as np

#URL for dataset https://archive.ics.uci.edu/ml/datasets/glass+identification


data = common.read_csv('C:/Users/andre/PycharmProjects/MachineLearningFinal/Data/glass.csv')

# since first feature is just an id number, this doesn't provide any useful information
common.remove_nth_column(data, 0)

class_index = 9
first_attribute_index = 0
last_attribute_index = 8

# update class labels: 1=>0, 2=>1, 3=>2, 4=>3, 5=>4, 6=>5, 7=>6
for point in data:
    val = float(point[class_index]) - 1
    point[class_index] = str(val)


class_values = [0, 1, 2, 3, 4, 5, 6]
Code Example #28
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy import sparse
from sklearn.pipeline import FeatureUnion, Pipeline, make_union, make_pipeline
from scipy.sparse import coo_matrix
from common import timer, read_csv, ItemSelector, TextStats

path = '../data/'

with timer("Load data"):
    df_apps_train = read_csv(path + "train/apps.csv")
    df_apps_test = read_csv(path + "test/apps.csv")

df_apps = pd.concat([df_apps_train, df_apps_test])

feature_union = make_union(
    make_pipeline(ItemSelector(key="apps"),
                  CountVectorizer(analyzer='word', ngram_range=(1, 1))),
    make_pipeline(ItemSelector(key="apps"),
                  TfidfVectorizer(analyzer='word', ngram_range=(1, 1), use_idf=False)),
    make_pipeline(ItemSelector(key="apps"), TextStats(), DictVectorizer()))

with timer("Fit feature_union"):
    feat = feature_union.fit_transform(df_apps)

out_col = ["app_stat_%s" % i for i in range(feat.shape[1])]
output_file = '123'
with timer("Save file to %s" % output_file):
    data = coo_matrix(feat.todense()).tocsr()
    sparse.save_npz('../data/train/train_app_stat.npz', data[:df_apps_train.shape[0]])