import time

import pandas as pd
from tqdm import tqdm

# `logger`, `DATA_PATH`, and `mem_ext` are module-level objects defined
# elsewhere in the original source.


def transform(file):

    start = time.time()
    logger.info("============== Feature Selection start ==============")
    logger.info("[start] : {}".format(str(start)))
    logger.info("DATA_PATH : {}".format(DATA_PATH))

    ##########################################
    # Feature selection
    ##########################################

    # Read the prepared feature table passed in as `file`.
    df = pd.read_csv(file, index_col=0)
    tqdm.pandas()
    logger.info("df shape : {}".format(df.shape))

    # Columns that must not leak into the feature matrix.
    drop_col = ['target', "$$$"]

    x = df.drop(drop_col, axis=1)
    y = df['target']

    x.fillna(0, inplace=True)

    logger.info("type transform")

    x = x.astype(int)
    x = mem_ext(x)  # memory-footprint reduction helper from the source
    return x, y
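
# What the tqdm.pandas() registration inside transform() enables — a
# self-contained toy, not part of the original pipeline:
import pandas as pd
from tqdm import tqdm

tqdm.pandas()
toy = pd.DataFrame({"a": range(1000), "b": range(1000), "target": 0})
x_toy = toy.drop(["target"], axis=1).progress_apply(lambda col: col / col.max())
print(x_toy.shape)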
# Example 2
import pandas as pd
import spacy
from sklearn.feature_extraction import text
from tqdm import tqdm

# `remove_all_tables`, `get_readable_text`, `clean_text`, and `lemmatize`
# are helpers defined elsewhere in the original source.


def preprocess_column(cfg,
                      df_data,
                      column_name,
                      do_lemmatize=True,
                      no_stopwords=True):
    tqdm.pandas()
    # Keep only rows with a non-null target column; copy so the column
    # assignments below do not trigger SettingWithCopyWarning.
    df_data = df_data[pd.notnull(df_data[column_name])].copy()
    # Strip table markup from the configured source column.
    df_data['temp'] = df_data[cfg.get(
        'postgres', 'column')].progress_apply(lambda x: remove_all_tables(x))
    df_data['readable_text'] = df_data['temp'].progress_apply(
        lambda x: get_readable_text(x))
    df_data.drop(['temp'], axis=1, inplace=True)
    df_data['processed_value'] = df_data['readable_text'].progress_apply(
        lambda x: clean_text(x))
    if do_lemmatize:
        # spaCy 3 removed the 'en' shorthand; load the small English
        # pipeline by its package name.
        parser = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
        df_data['processed_value'] = df_data['processed_value'].progress_apply(
            lambda x: lemmatize(x, parser))
    if no_stopwords:
        df_data['processed_value'] = df_data['processed_value'].progress_apply(
            lambda x: ' '.join([
                word for word in x.split()
                if word not in text.ENGLISH_STOP_WORDS
            ]))
    return df_data
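
# Self-contained sketch of the stop-word pass above on toy sentences,
# using the same sklearn stop-word list:
import pandas as pd
from sklearn.feature_extraction import text
from tqdm import tqdm

tqdm.pandas()
docs = pd.Series(["this is a short example", "stop words are removed here"])
print(docs.progress_apply(
    lambda s: " ".join(w for w in s.split()
                       if w not in text.ENGLISH_STOP_WORDS)).tolist())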
def genotype_to_iupac(geno, thresh=0.5, progress=True):
    # Row-wise conversion of fuzzy allele calls to IUPAC codes; `thresh`
    # is accepted for API compatibility but unused in this excerpt.
    if progress:
        tqdm.pandas()
        out = (geno.stack('strain').progress_apply(
            fuzzy_allele_to_iupac, axis='columns').unstack('strain'))
    else:
        out = (geno.stack('strain').apply(fuzzy_allele_to_iupac,
                                          axis='columns').unstack('strain'))

    return out
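
# The stack/progress_apply/unstack pattern above on a toy wide frame;
# the columns play the 'strain' level and the row-wise lambda is a
# stand-in for fuzzy_allele_to_iupac:
import pandas as pd
from tqdm import tqdm

tqdm.pandas()
geno_demo = pd.DataFrame({"s1": [0.1, 0.9], "s2": [0.8, 0.2]})
geno_demo.columns.name = "strain"
calls = (geno_demo.stack("strain")
         .to_frame("p")
         .progress_apply(lambda row: "A" if row["p"] > 0.5 else "T",
                         axis="columns")
         .unstack("strain"))
print(calls)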
#TQDM build
def tqdm_pandas(t):
    # Hand-rolled precursor to tqdm's built-in pandas integration:
    # wrap Series.apply so each element ticks the supplied bar `t`.
    from pandas import Series

    def inner(series, func, *args, **kwargs):
        t.total = series.size

        def wrapper(*args, **kwargs):
            t.update(1)
            return func(*args, **kwargs)

        result = series.apply(wrapper, *args, **kwargs)
        t.close()
        return result

    Series.progress_apply = inner


from tqdm.notebook import tqdm_notebook

tqdm_pandas(tqdm_notebook())
tqdm.pandas(desc="my bar!")
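
# Runnable demo of the registration above on a toy Series (synthetic
# data; any Series works the same way):
import pandas as pd

squares = pd.Series(range(10000)).progress_apply(lambda v: v * v)
print(squares.tail())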

#Feature engineering
import os
from glob import glob

import librosa
import numpy as np
import pandas as pd
from scipy.stats import skew
from tqdm import tqdm

SAMPLE_RATE = 22050
print(os.listdir(os.getcwd()))
tqdm.pandas()  # register progress_apply with pandas

# glob the wav files directly under each split directory
train_files = glob(train_path + '*.wav')
test_files = glob(test_path + '*.wav')

def get_feature(fname):
    # 'kaiser_fast' resampling trades a little accuracy for speed.
    b, _ = librosa.load(fname, res_type='kaiser_fast')
    try:
        # The source is truncated here; aggregating per-coefficient MFCC
        # statistics is a typical completion (an assumption, not the
        # original body).
        mfcc = librosa.feature.mfcc(y=b, sr=SAMPLE_RATE)
        return np.concatenate(
            [mfcc.mean(axis=1), mfcc.std(axis=1), skew(mfcc, axis=1)])
    except Exception:
        return np.zeros(60)  # 3 stats x 20 default MFCC coefficients
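
# Batch extraction with a progress bar; the feature-matrix layout is an
# assumption, and the globs above must have matched real files:
train_feats = pd.Series(train_files).progress_apply(get_feature)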
#Logging setup
import logging

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# file and console handlers (the log filename is an assumed placeholder)
fh = logging.FileHandler("run.log")
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
# create formatter and add it to the handlers
formatter = logging.Formatter(
    "%(asctime)s - %(name)s - %(lineno)d - %(levelname)s - %(message)s")
fh.setFormatter(formatter)
ch.setFormatter(formatter)
# add the handlers to the logger
logger.addHandler(fh)
logger.addHandler(ch)

# This is for jupyter notebook
# from tqdm.notebook import tqdm_notebook
# tqdm_notebook.pandas()
# tqdm_func = tqdm_notebook
# This is for terminal
tqdm.pandas(desc="Progress")
tqdm_func = tqdm
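
# Quick check of the tqdm_func indirection: the same loop renders a
# notebook widget or a terminal bar depending on which branch ran above.
for _ in tqdm_func(range(1000), desc="Progress"):
    pass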

# Params
from sqlalchemy import create_engine

import config

engine = create_engine(config.DB_STR)
logger.info("Logging to get line")
engine.connect()  # fail fast if the database is unreachable

out_nodes_table_name = "outnodes"
out_edges_table_name = "outedges"
in_nodes_table_name = "innodes"
in_edges_table_name = "inedges"

cards_graphs_as_json_to_table = f"{config.CARDS_JSON_TNAME}_temp"
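
# Hypothetical write path for the tables declared above; the toy frame
# and the if_exists policy are illustrative assumptions:
import pandas as pd

nodes_df = pd.DataFrame({"id": [1, 2], "label": ["a", "b"]})
nodes_df.to_sql(out_nodes_table_name, engine, if_exists="replace", index=False)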
# Example 6
import glob
import hashlib
import json
import os
import pickle
import re
import shutil

import gin
import nussl
import numpy as np
import pandas as pd
from tqdm import tqdm

tqdm.pandas()

@gin.configurable
def construct_dataframe(json_path, sep_audio_path, og_audio_path, cache_location):
    json_files = glob.glob(f"{json_path}/**/*.json", recursive=True)

    # Cache key: hash of the discovered evaluation-file list.
    hash_file = hashlib.sha224(" ".join(json_files).encode('utf-8')).hexdigest()
    hash_file = os.path.join(cache_location, hash_file)
    os.makedirs(cache_location, exist_ok=True)
    print(f"Writing or looking for {hash_file}")

    if os.path.exists(hash_file):
        with open(hash_file, 'rb') as f:
            df = pickle.load(f)
            return df

    df = nussl.evaluation.aggregate_score_files(json_files)
    df = df[df['source'] == 'vocals']

    # The excerpt is truncated here; writing the cache and returning the
    # frame is the natural completion (an assumption, not the original).
    with open(hash_file, 'wb') as f:
        pickle.dump(df, f)
    return df
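
# Hypothetical gin invocation — the binding names mirror the function's
# parameters, and every path below is a placeholder:
# gin.parse_config("""
# construct_dataframe.json_path = 'results'
# construct_dataframe.sep_audio_path = 'sep'
# construct_dataframe.og_audio_path = 'og'
# construct_dataframe.cache_location = '.cache'
# """)
# df = construct_dataframe()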
# Example 7
import pandas as pd

# Use the notebook-aware bar under Jupyter and the terminal bar
# otherwise; get_ipython() only exists inside IPython, hence the
# NameError guard.
try:
    ipy_str = str(type(get_ipython()))
    if "zmqshell" in ipy_str:
        from tqdm import tqdm_notebook as tqdm
    else:
        from tqdm import tqdm
except NameError:
    from tqdm import tqdm

tqdm.pandas()


def read_cdwow(filename):
    df = (
        # read the data
        pd.read_csv(
            filename,
            names=["id", "date", "num_purchased", "dollars"],
            header=None,
            sep=r"\s+",
        )
        # format the dates as dates
        .pipe(lambda x: x.assign(date=pd.to_datetime(x.date, format="%Y%m%d")))
        # calculate the average cost per CD in the basket
        .pipe(lambda x: x.assign(price_per_cd=x.dollars / x.num_purchased)))

    return df
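
# Example call; CDNOW_master.txt is the conventional filename for this
# whitespace-separated purchase log (assumed here, not from the source):
# purchases = read_cdwow("CDNOW_master.txt")
# print(purchases.head())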
import pandas as pd
import sklearn
import sklearn.ensemble
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from tqdm import tqdm

import functions
################
## Requirements
################
tqdm.pandas(
    desc="my bar!"
)  ## Registers `DataFrame.progress_apply` and `Series.progress_map` with `tqdm`

################
## Data Preparation
################
fileName1 = "/home/huiyangd/toxicSpans/data/2018_01_n_1000_32.csv"
fileName2 = "/home/huiyangd/toxicSpans/data/2018_01_p_1000_32.csv"

RC_2018_01_n_1000_32 = pd.read_csv(fileName1)
RC_2018_01_p_1000_32 = pd.read_csv(fileName2)
frames = [RC_2018_01_n_1000_32, RC_2018_01_p_1000_32]
RC_2018_01_combined_1000 = pd.concat(frames)

class_names = list(RC_2018_01_combined_1000.toxicity.unique())
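
# A sketch of where these imports typically lead: a TF-IDF + logistic
# regression baseline over the combined frame. The comment-text column
# name ("body") is an assumption:
x_train, x_test, y_train, y_test = train_test_split(
    RC_2018_01_combined_1000["body"],
    RC_2018_01_combined_1000["toxicity"],
    test_size=0.2,
    random_state=0)
clf = make_pipeline(TfidfVectorizer(), LogisticRegression(max_iter=1000))
clf.fit(x_train, y_train)
print(f1_score(y_test, clf.predict(x_test), average="macro"))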