def test_pandas_rolling_expanding():
    """Test pandas.(Series|DataFrame).(rolling|expanding)"""
    try:
        from numpy.random import randint
        import pandas as pd
    except ImportError:
        raise SkipTest

    with closing(StringIO()) as our_file:
        tqdm.pandas(file=our_file, leave=True, ascii=True)

        series = pd.Series(randint(0, 50, (123,)))
        res1 = series.rolling(10).progress_apply(lambda x: 1, raw=True)
        res2 = series.rolling(10).apply(lambda x: 1, raw=True)
        assert res1.equals(res2)

        res3 = series.expanding(10).progress_apply(lambda x: 2, raw=True)
        res4 = series.expanding(10).apply(lambda x: 2, raw=True)
        assert res3.equals(res4)

        expects = ['114it']  # 123-10+1
        for exres in expects:
            our_file.seek(0)
            if our_file.getvalue().count(exres) < 2:
                our_file.seek(0)
                raise AssertionError(
                    "\nExpected:\n{0}\nIn:\n{1}\n".format(
                        exres + " at least twice.", our_file.read()))
def test_pandas_series():
    """Test pandas.Series.progress_apply and .progress_map"""
    try:
        from numpy.random import randint
        import pandas as pd
    except ImportError:
        raise SkipTest

    with closing(StringIO()) as our_file:
        tqdm.pandas(file=our_file, leave=True, ascii=True)

        series = pd.Series(randint(0, 50, (123,)))
        res1 = series.progress_apply(lambda x: x + 10)
        res2 = series.apply(lambda x: x + 10)
        assert res1.equals(res2)

        res3 = series.progress_map(lambda x: x + 10)
        res4 = series.map(lambda x: x + 10)
        assert res3.equals(res4)

        expects = ['100%', '123/123']
        for exres in expects:
            our_file.seek(0)
            if our_file.getvalue().count(exres) < 2:
                our_file.seek(0)
                raise AssertionError(
                    "\nExpected:\n{0}\nIn:\n{1}\n".format(
                        exres + " at least twice.", our_file.read()))
Example #3
def test_pandas_groupby_apply():
    """ Test pandas.DataFrame.groupby(...).progress_apply """
    try:
        from numpy.random import randint
        import pandas as pd
    except ImportError:
        raise SkipTest

    with closing(StringIO()) as our_file:
        tqdm.pandas(file=our_file, leave=False, ascii=True)

        df = pd.DataFrame(randint(0, 50, (500, 3)))
        df.groupby(0).progress_apply(lambda x: None)

        dfs = pd.DataFrame(randint(0, 50, (500, 3)),
                           columns=list('abc'))
        dfs.groupby(['a']).progress_apply(lambda x: None)

        our_file.seek(0)

        # don't expect final output since no `leave` and
        # high dynamic `miniters`
        nexres = '100%|##########|'
        if nexres in our_file.read():
            our_file.seek(0)
            raise AssertionError("\nDid not expect:\n{0}\nIn:{1}\n".format(
                nexres, our_file.read()))
def test_pandas_groupby_apply():
    """Test pandas.DataFrame.groupby(...).progress_apply"""
    try:
        from numpy.random import randint
        import pandas as pd
    except ImportError:
        raise SkipTest

    with closing(StringIO()) as our_file:
        tqdm.pandas(file=our_file, leave=False, ascii=True)

        df = pd.DataFrame(randint(0, 50, (500, 3)))
        df.groupby(0).progress_apply(lambda x: None)

        dfs = pd.DataFrame(randint(0, 50, (500, 3)), columns=list('abc'))
        dfs.groupby(['a']).progress_apply(lambda x: None)

        our_file.seek(0)

        # don't expect final output since no `leave` and
        # high dynamic `miniters`
        nexres = '100%|##########|'
        if nexres in our_file.read():
            our_file.seek(0)
            raise AssertionError("\nDid not expect:\n{0}\nIn:{1}\n".format(
                nexres, our_file.read()))

    with closing(StringIO()) as our_file:
        tqdm.pandas(file=our_file, leave=True, ascii=True)

        dfs = pd.DataFrame(randint(0, 50, (500, 3)), columns=list('abc'))
        dfs.loc[0] = [2, 1, 1]
        dfs['d'] = 100

        expects = ['500/500', '1/1', '4/4', '2/2']
        dfs.groupby(dfs.index).progress_apply(lambda x: None)
        dfs.groupby('d').progress_apply(lambda x: None)
        dfs.groupby(dfs.columns, axis=1).progress_apply(lambda x: None)
        dfs.groupby([2, 2, 1, 1], axis=1).progress_apply(lambda x: None)

        our_file.seek(0)
        if our_file.read().count('100%') < 4:
            our_file.seek(0)
            raise AssertionError("\nExpected:\n{0}\nIn:\n{1}\n".format(
                '100% at least four times', our_file.read()))

        for exres in expects:
            our_file.seek(0)
            if our_file.getvalue().count(exres) < 1:
                our_file.seek(0)
                raise AssertionError(
                    "\nExpected:\n{0}\nIn:\n {1}\n".format(
                        exres + " at least once.", our_file.read()))
def test_pandas_setup():
    """Test tqdm.pandas()"""
    try:
        from numpy.random import randint
        import pandas as pd
    except ImportError:
        raise SkipTest

    with closing(StringIO()) as our_file:
        tqdm.pandas(file=our_file, leave=True, ascii=True, total=123)
        series = pd.Series(randint(0, 50, (100,)))
        series.progress_apply(lambda x: x + 10)
        res = our_file.getvalue()
        assert '100/123' in res
Example #6
def test_pandas_map():
    """ Test pandas.Series.progress_map """
    try:
        from numpy.random import randint
        import pandas as pd
    except ImportError:
        raise SkipTest

    with closing(StringIO()) as our_file:
        tqdm.pandas(file=our_file, leave=True, ascii=True)
        dfs = pd.DataFrame(randint(0, 50, (500, 3)), columns=list("abc"))
        dfs.a.progress_map(lambda x: None)

        if our_file.getvalue().count("100%") < 1:
            raise AssertionError("\nExpected:\n{0}\nIn:{1}\n".format(
                "100% at least once", our_file.getvalue()))
Example #7
def map_translate(x):
    print("Beginning to translate description")
    tqdm.pandas(tqdm())
    x['en_desc'] = x['description'].progress_map(translate)
    print("Done translating description")
    print("Beginning to translate title")
    x['en_title'] = x['title'].progress_map(translate)
    print("Done translating")
    print("Beginning to translate region")
    x['en_region'] = x['region'].progress_map(translate)
    print("Done translating")
    print("Beginning to translate city")
    x['en_city'] = x['city'].progress_map(translate)
    print("Done translating")
    print("Beginning to translate category_name")
    x['en_category_name'] = x['category_name'].progress_map(translate)
    print("Done translating")
    return x
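
# A hedged sketch of the `translate` helper used above (its definition is not
# shown in this snippet); this version assumes the third-party googletrans
# package, which the original may or may not have used.
from googletrans import Translator

_translator = Translator()

def translate(text):
    # translate arbitrary text to English; fall back to the input on failure
    try:
        return _translator.translate(str(text), dest='en').text
    except Exception:
        return text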
Example #8
def test_pandas_leave():
    """ Test pandas with `leave=True` """
    try:
        from numpy.random import randint
        import pandas as pd
    except ImportError:
        raise SkipTest

    with closing(StringIO()) as our_file:
        df = pd.DataFrame(randint(0, 100, (1000, 6)))
        tqdm.pandas(file=our_file, leave=True, ascii=True)
        df.groupby(0).progress_apply(lambda x: None)

        our_file.seek(0)

        exres = "100%|##########| 101/101"
        if exres not in our_file.read():
            our_file.seek(0)
            raise AssertionError("\nExpected:\n{0}\nIn:{1}\n".format(exres, our_file.read()))
Example #9
def __debug_angle_error_per_number_of_bins_snr_and_samples():
    from tqdm import tqdm
    import pandas as pd
    import numpy as np
    tqdm.pandas()

    # a = [[10, 100, 1000, 10000], [100, 300], [100, 1000, 10000], [None]]
    # a = [[100], [100], [10000], [0.1, 0.01]]
    # inx = pd.MultiIndex.from_product(a, names=['samples', 'hist_bins', 'snr', 'quant_size'])
    a = dict(samples=[10, 100, 1000, 10000], hist_bins=[100, 300], snr=[100, 1000, 10000], quant_size=[0])
    a = dict(samples=[100], hist_bins=[100], snr=[10000], quant_size=[0.2, 0.1, 0.01, 0.0001])
    inx = pd.MultiIndex.from_product(a.values(), names=a.keys())

    df = pd.DataFrame(index=inx).reset_index(drop=False).sample(frac=1)
    df = pd.concat([df] * 10000, ignore_index=True)

    df = df.join(df.progress_apply(lambda row: compare_sinogram_and_eigen_vector(**row.to_dict()), axis=1).apply(pd.Series))
    df.to_csv('angle_error_by_sinogram.csv', header=None, mode='a')
    # df.to_csv('angle_error_by_sinogram.csv', mode='w')
    print(df.head())
Example #10
def main():

    # open
    df_train = pd.read_csv('../middle/train_lemma_stem.csv')
    df_test = pd.read_csv('../middle/test_lemma_stem.csv')

    # drop None
    df_train = df_train.replace(np.nan, ' ', regex=True)
    df_test = df_test.replace(np.nan, ' ', regex=True)

    # progress bars initialize
    tqdm.pandas(desc="my bar!")
    loop_tuple1 = [('punct_re1', 'question1'), ('punct_re2', 'question2')]
    loop_tuple2 = [('stem_q1', 'punct_re1'), ('stem_q2', 'punct_re2')]
    loop_tuple3 = [('lemmas_q1', 'punct_re1'), ('lemmas_q2', 'punct_re2')]

    # apply the functions punct_re, spell_lemma and spell_stem to the df_train and df_test dataframes
    df_train, df_test = implement_fun(df_train, df_test, punct_re, loop_tuple1)
    df_train, df_test = implement_fun(df_train, df_test, spell_lemma, loop_tuple2)
    df_train, df_test = implement_fun(df_train, df_test, spell_stem, loop_tuple3)
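
# A hedged sketch of the implement_fun helper assumed above (it is not shown in
# this snippet): for each (new_column, source_column) pair, apply `fun` to the
# source column with the tqdm-enabled progress_map and store the result.
def implement_fun(df_train, df_test, fun, loop_tuples):
    for new_col, src_col in loop_tuples:
        df_train[new_col] = df_train[src_col].progress_map(fun)
        df_test[new_col] = df_test[src_col].progress_map(fun)
    return df_train, df_test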
def test_pandas_data_frame():
    """Test pandas.DataFrame.progress_apply and .progress_applymap"""
    try:
        from numpy.random import randint
        import pandas as pd
    except ImportError:
        raise SkipTest

    with closing(StringIO()) as our_file:
        tqdm.pandas(file=our_file, leave=True, ascii=True)
        df = pd.DataFrame(randint(0, 50, (100, 200)))

        def task_func(x):
            return x + 1

        # applymap
        res1 = df.progress_applymap(task_func)
        res2 = df.applymap(task_func)
        assert res1.equals(res2)

        # apply
        for axis in [0, 1, 'index', 'columns']:
            res3 = df.progress_apply(task_func, axis=axis)
            res4 = df.apply(task_func, axis=axis)
            assert res3.equals(res4)

        our_file.seek(0)
        if our_file.read().count('100%') < 3:
            our_file.seek(0)
            raise AssertionError("\nExpected:\n{0}\nIn:\n{1}\n".format(
                '100% at least three times', our_file.read()))

        # applymap, apply axis=0, apply axis=1
        expects = ['20000/20000', '200/200', '100/100']
        for exres in expects:
            our_file.seek(0)
            if our_file.getvalue().count(exres) < 1:
                our_file.seek(0)
                raise AssertionError(
                    "\nExpected:\n{0}\nIn:\n {1}\n".format(
                        exres + " at least once.", our_file.read()))
Example #12
def test_pandas_apply():
    """ Test pandas.DataFrame[.series].progress_apply """
    try:
        from numpy.random import randint
        import pandas as pd
    except ImportError:
        raise SkipTest

    with closing(StringIO()) as our_file:
        tqdm.pandas(file=our_file, leave=True, ascii=True)
        df = pd.DataFrame(randint(0, 50, (500, 3)))
        df.progress_apply(lambda x: None)

        dfs = pd.DataFrame(randint(0, 50, (500, 3)),
                           columns=list('abc'))
        dfs.a.progress_apply(lambda x: None)

        our_file.seek(0)

        if our_file.read().count('100%') < 2:
            our_file.seek(0)
            raise AssertionError("\nExpected:\n{0}\nIn:{1}\n".format(
                '100% at least twice', our_file.read()))
Example #13
    def postprocess(self, fps, sampling_rate=1, use_kalman=False):
        """
        This function should be called after loading the data by loader
        It performs the following steps:
        -: check fps value, should be set and bigger than 0
        -: check critical columns should exist in the table
        -: update data types
        -: fill 'groupmates' if they are not set
        -: if velocities do not exist, compute them for each agent
        -: compute bounding box of trajectories

        :param fps: video framerate
        :param sampling_rate: if bigger than one, the data needs downsampling,
                              otherwise needs interpolation
        :param use_kalman:  for smoothing agent velocities
        :return: None
        """

        # check
        for critical_column in self.critical_columns:
            if critical_column not in self.data:
                raise ValueError(
                    "Error! some critical columns are missing from trajectory dataset!"
                )

        # modify data types
        self.data["frame_id"] = self.data["frame_id"].astype(int)
        if str(self.data["agent_id"].iloc[0]).replace('.', '', 1).isdigit():
            self.data["agent_id"] = self.data["agent_id"].astype(int)
        self.data["pos_x"] = self.data["pos_x"].astype(float)
        self.data["pos_y"] = self.data["pos_y"].astype(float)
        self.data["label"] = self.data["label"].str.lower(
        )  # search with lower-case labels

        # fill scene_id
        if "scene_id" not in self.data:
            self.data["scene_id"] = 0
        self.fps = fps

        # fill timestamps based on frame_id and video_fps
        if "timestamp" not in self.data:
            self.data["timestamp"] = self.data["frame_id"] / fps

        # fill groupmates
        agent_ids = pd.unique(self.data["agent_id"])
        for agent_id in agent_ids:
            if agent_id not in self.groupmates:
                self.groupmates[agent_id] = []

        # down/up sampling frames
        if sampling_rate >= 2:
            # FixMe: down-sampling
            sampling_rate = int(sampling_rate)
            self.data = self.data.loc[(self.data["frame_id"] %
                                       sampling_rate) == 0]
            self.data = self.data.reset_index()
        elif sampling_rate < (1 - 1E-2):
            # TODO: interpolation
            pass
        else:
            pass

        # remove the trajectories shorter than 2 frames
        data_grouped = self.data.groupby(["scene_id", "agent_id"])
        single_length_inds = data_grouped.head(1).index[
            data_grouped.size() < 2]
        self.data = self.data.drop(single_length_inds)

        # fill velocities
        if "vel_x" not in self.data:
            data_grouped = self.data.groupby(["scene_id", "agent_id"])
            dt = data_grouped["timestamp"].diff()

            if (dt > 2).sum():
                print('Warning! too big dt in [%s]' % self.title)

            self.data["vel_x"] = (data_grouped["pos_x"].diff() /
                                  dt).astype(float)
            self.data["vel_y"] = (data_grouped["pos_y"].diff() /
                                  dt).astype(float)
            nan_inds = np.array(np.nonzero(dt.isnull().to_numpy())).reshape(-1)
            self.data["vel_x"].iloc[nan_inds] = self.data["vel_x"].iloc[
                nan_inds + 1].to_numpy()
            self.data["vel_y"].iloc[nan_inds] = self.data["vel_y"].iloc[
                nan_inds + 1].to_numpy()

        # ============================================
        if use_kalman:

            def smooth(group):
                if len(group) < 2: return group
                dt = group["timestamp"].diff().iloc[1]
                kf = KalmanModel(dt, n_dim=2, n_iter=7)
                smoothed_pos, smoothed_vel = kf.smooth(
                    group[["pos_x", "pos_y"]].to_numpy())
                group["pos_x"] = smoothed_pos[:, 0]
                group["pos_y"] = smoothed_pos[:, 1]

                group["vel_x"] = smoothed_vel[:, 0]
                group["vel_y"] = smoothed_vel[:, 1]
                return group

            tqdm.pandas(desc="Smoothing trajectories (%s)" % self.title)
            # print('Smoothing trajectories ...')
            data_grouped = self.data.groupby(["scene_id", "agent_id"])
            self.data = data_grouped.progress_apply(smooth)
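
# The KalmanModel helper used above is not defined in this listing. A possible
# sketch, assuming the pykalman package (an assumption, not necessarily the
# author's dependency): a constant-velocity Kalman smoother that returns
# smoothed positions and velocities.
import numpy as np
from pykalman import KalmanFilter

class KalmanModel:
    def __init__(self, dt, n_dim=2, n_iter=7):
        self.n_dim = n_dim
        self.n_iter = n_iter
        # state = [positions, velocities]; constant-velocity transition model
        self.F = np.eye(2 * n_dim)
        self.F[:n_dim, n_dim:] = dt * np.eye(n_dim)
        # only positions are observed
        self.H = np.hstack([np.eye(n_dim), np.zeros((n_dim, n_dim))])

    def smooth(self, measurements):
        X = np.asarray(measurements, dtype=float)
        kf = KalmanFilter(transition_matrices=self.F,
                          observation_matrices=self.H)
        kf = kf.em(X, n_iter=self.n_iter)
        states, _ = kf.smooth(X)
        # split the smoothed state into positions and velocities
        return states[:, :self.n_dim], states[:, self.n_dim:]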
Example #14
# importing the libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from tqdm import tqdm
import pandas_profiling

%matplotlib inline
tqdm.pandas(desc="Operation Progress")

test_file = pd.read_csv("train.csv")
test_file = test_file.drop('ID_code',axis=1)
test_sample = test_file.sample(n=384,random_state=1)

# Defining code to return the most correlated features
# https://towardsdatascience.com/feature-selection-correlation-and-p-value-da8921bfb3cf
# Make a df with a sample of 384 rows
# Run the code below and build a model fitting the highest correlations, then run the predictions

def get_redundant_pairs(test_sample):
    '''Get diagonal and lower triangular pairs of correlation matrix'''
    pairs_to_drop = set()
    cols = test_sample.columns
    for i in range(0, test_sample.shape[1]):
        for j in range(0, i+1):
            pairs_to_drop.add((cols[i], cols[j]))
    return pairs_to_drop

def get_top_abs_correlations(test_sample, n=5):
    au_corr = test_sample.corr().abs().unstack()
Example #15
def spacy_tokenizer(sentence):
    mytokens = parser(sentence)
    mytokens = [
        word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_
        for word in mytokens
    ]
    mytokens = [
        word for word in mytokens
        if word not in stopwords and word not in punctuations
    ]
    mytokens = " ".join([i for i in mytokens])
    return mytokens


# Applying the text-processing function on the body_text.
tqdm.pandas()
df["processed_text"] = df["body_text"].progress_apply(spacy_tokenizer)

# Let's take a look at word count in the papers
import seaborn as sns

sns.distplot(df['body_word_count'])
print(df['body_word_count'].describe())

sns.distplot(df['body_unique_words'])
print(df['body_unique_words'].describe())

# Vectorization

from sklearn.feature_extraction.text import TfidfVectorizer
# Sentence size range from 7 to 7944 characters.

# ### 3.1 NLP

# Now I will tokenize and lemmatize the sentences.
#
# Tokenization separates the sentences into a set of tokens.
#
# Lemmatization reduces the inflectional forms of each word to a common base or root; unlike stemming, it takes into consideration the morphological analysis of the words.
#
# I still need to better understand stop-word removal in the context of Text Classification. I do believe (for now) that we should not remove the negative words at least. Some discussion [here](https://datascience.stackexchange.com/questions/31048/pros-cons-of-stop-word-removal), [here](https://www.researchgate.net/post/Does_Pre-processing_step_Remove_Stop_Word_effect_Sentiment_Analysis_result) and [here](https://stackoverflow.com/questions/40144473/do-we-need-to-use-stopwords-filtering-before-pos-tagging)

# In[17]:

tqdm.pandas()
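
# lemmatize_sent is used below but not defined in this excerpt. A minimal
# sketch (an assumption, based on NLTK with the punkt, averaged_perceptron_tagger
# and wordnet data downloaded) could look like this:
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer

_lemmatizer = WordNetLemmatizer()

def _penn_to_wn(tag):
    # map Penn Treebank POS tags to WordNet POS tags
    if tag.startswith('J'):
        return wn.ADJ
    if tag.startswith('V'):
        return wn.VERB
    if tag.startswith('R'):
        return wn.ADV
    return wn.NOUN

def lemmatize_sent(text):
    # tokenize, POS-tag, then lemmatize each token with its mapped POS
    return [_lemmatizer.lemmatize(tok.lower(), _penn_to_wn(tag))
            for tok, tag in pos_tag(word_tokenize(text))]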

# In[18]:

df['tokens'] = df['sentence'].progress_apply(lambda d: lemmatize_sent(d))
# df['tokens'] = df.sentence.progress_apply(lambda d: lemmatize_sent(d), [i for i in df.sentence.values], [])
# flatten = lambda data: reduce(lambda x, y: x + y, data, [])

# In[19]:

df['tokens'][:3]

# In[20]:

sum(df['sentence'].isnull())
Example #17
def urm_neg_score_user(mode,
                       _last_click_score=1,
                       _clicked_ref_score=1,
                       _impr_not_seen_score=0,
                       _seen_ref_score=1,
                       cluster='no_cluster'):
    global impr_not_seen_score, last_click_score, seen_ref_score, clicked_ref_score
    impr_not_seen_score = _impr_not_seen_score
    last_click_score = _last_click_score
    clicked_ref_score = _clicked_ref_score
    seen_ref_score = _seen_ref_score

    save_path = 'dataset/preprocessed/{}/{}/matrices/'.format(cluster, mode)

    accomodations_array = data.accomodations_ids()

    # load the dataframes according to the mode and cluster
    train_df = data.train_df(mode=mode, cluster=cluster)
    test_df = data.test_df(mode=mode, cluster=cluster)

    # fill missing clickout_item on the test dataframe
    test_df.fillna({'reference': -1}, inplace=True)
    train_df.fillna({'reference': -1}, inplace=True)

    # concatenate the train df and the test df, maintaining only the columns of interest
    df = pd.concat([train_df, test_df])[[
        'session_id', 'user_id', 'action_type', 'reference', 'impressions'
    ]]

    session_groups = df.groupby(['user_id'])
    session_ids = list(session_groups.groups.keys())

    rows_count = len(session_groups)
    cols_count = len(accomodations_array)

    # create dictionary (k: sessionId - v: urm row)
    row_of_sessionid = {}
    for i in range(len(session_ids)):
        row_of_sessionid[session_ids[i]] = i

    # create dictionary (k: accomodationId - v: urm col)
    col_of_accomodation = {}
    for i in range(cols_count):
        col_of_accomodation[accomodations_array[i]] = i

    print('dictionaries created\n')

    tqdm.pandas()
    sessions_score = session_groups.progress_apply(
        _session_score_negative_value_seen_elem).values
    print("apply function done\n")

    # create the URM using the data, indices and indptr arrays
    _data = []
    indptr = [0]
    indices = []

    values_inserted = 0
    for i in tqdm(range(rows_count)):
        score_dict = sessions_score[i]
        for k in score_dict.keys():
            # TODO: FIND WHY THERE IS A KEY EQUAL -1
            if k != -1:
                indices.append(col_of_accomodation[k])
                _data.append(score_dict[k])
                values_inserted += 1
        indptr.append(values_inserted)
    _urm = sps.csr_matrix((_data, indices, indptr),
                          shape=(rows_count, cols_count))

    print("URM created\n")

    # check that the folder to save into exists
    cf.check_folder(save_path)

    print('Saving urm matrix... ')
    sps.save_npz('{}/urm_negative_user.npz'.format(save_path), _urm)
    print('done!')

    print('Saving row dictionary... ')
    np.save('{}/dict_row_user.npy'.format(save_path), row_of_sessionid)
    print('done!')

    print('Saving col dictionary... ')
    np.save('{}/dict_col_user.npy'.format(save_path), col_of_accomodation)
    print('done!')
    def loadCSVs(
        self,
        tokenFilename: str = "data_aspects_tokens.csv",
        preprocessedFilename: str = "data_preprocessed.csv",
        lexiconFilename: str = "sentiment_lexicon.csv",
    ) -> bool:
        """
        load all necessary CSV for execution of the detector and set indices as appropriate

        Args:
            tokenFilename (str, optional): Defaults to "data_aspects_tokens.csv".
            preprocessedFilename (str, optional): Defaults to "data_preprocessed.csv".
            lexiconFilename (str, optional): Defaults to "sentiment_lexicon.csv".

        Returns:
            bool: successful execution

        """
        try:
            if self.df_aspect_tokens is None or self.df_aspect_tokens.empty:
                self.df_aspect_tokens = PD.read_csv(self.path + tokenFilename)

                self.df_aspect_tokens["polarity_strength"] = PD.NaT
                self.df_aspect_tokens["polarity_strength"].fillna(
                    {i: []
                     for i in self.df_aspect_tokens.index}, inplace=True)

                self.df_aspect_tokens["sentiment_words"] = PD.NaT
                self.df_aspect_tokens["sentiment_words"].fillna(
                    {i: []
                     for i in self.df_aspect_tokens.index}, inplace=True)

                self.df_aspect_tokens["intensifier_words"] = PD.NaT
                self.df_aspect_tokens["intensifier_words"].fillna(
                    {i: []
                     for i in self.df_aspect_tokens.index}, inplace=True)

                self.df_aspect_tokens["word_found"] = self.df_aspect_tokens[
                    "word_found"].str.replace(r"[^\w]*", "", regex=True)

                # TODO remove after debugging
                # self.df_aspect_tokens = self.df_aspect_tokens[:100]

            if self.df_preprocessed is None or self.df_preprocessed.empty:
                self.df_preprocessed = PD.read_csv(self.path +
                                                   preprocessedFilename)

                # pandas read_csv does not read arrays correctly so we need to adjust those
                tqdm.pandas(desc="Applying Datatype Transformations....")
                self.df_preprocessed["tokens"] = self.df_preprocessed[
                    "tokens"].progress_apply(lambda x: json.loads(x))

            if self.df_lexicon is None or self.df_lexicon.empty:
                if not os.path.exists(self.path + lexiconFilename):
                    self.downloadLexicon()

                self.df_lexicon = PD.read_csv(self.path + lexiconFilename)
                self.df_lexicon.drop_duplicates(subset=["word", "qualifier"],
                                                inplace=True)
                self.df_lexicon.set_index("word", inplace=True)
                self.df_lexicon.drop("%%")

            return True
        except IOError as e:
            print(e)
            return False
"""

import pickle
import os
import pandas as pd
import torch
import spacy
import re
from itertools import permutations
from tqdm import tqdm
from .preprocessing_funcs import load_dataloaders
from ..misc import save_as_pickle

import logging

tqdm.pandas(desc="prog-bar")
logging.basicConfig(format='%(asctime)s [%(levelname)s]: %(message)s', \
                    datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)
logger = logging.getLogger('__file__')

def load_pickle(filename):
    completeName = os.path.join(os.getcwd() + "/src/data/",\
                                filename)
    with open(completeName, 'rb') as pkl_file:
        data = pickle.load(pkl_file)
    return data

class infer_from_trained(object):
    def __init__(self, args=None, detect_entities=False):
        if args is None:
            self.args = load_pickle("args.pkl")
Example #20
    def annotate(self) -> None:
        """
        function to call the "findAspects()" function for every row
        """
        tqdm.pandas(desc="Finding Aspects!")
        self.data.progress_apply(lambda x: self.findAspects(x), axis=1)
Example #21
    def run(self):
        tqdm.pandas(tqdm)

        kaggle_train_data = pandas.read_csv(
            os.path.expanduser('~/Datasets/Kaggle-Quora/train.csv')).drop(
                'id', 1)

        a = kaggle_train_data.qid1.apply(
            lambda v: mmh3.hash(str(v).encode('ascii'), 2213) % 128 > 1)
        b = kaggle_train_data.qid2.apply(
            lambda v: mmh3.hash(str(v).encode('ascii'), 6663) % 128 > 1)

        print((a & b).sum(), np.invert(a).sum(), np.invert(b).sum())

        print('Raw training questions')
        kaggle_train_data['question1_raw'] = kaggle_train_data[
            'question1'].fillna('')
        kaggle_train_data['question2_raw'] = kaggle_train_data[
            'question2'].fillna('')
        print('Clean training tokens')
        kaggle_train_data['question1_tokens'] = kaggle_train_data[
            'question1_raw'].progress_apply(clean_text)
        kaggle_train_data['question2_tokens'] = kaggle_train_data[
            'question2_raw'].progress_apply(clean_text)
        assert kaggle_train_data['question1_tokens'].str.len().max(
        ) > 0, 'No tokens 1 found'
        assert kaggle_train_data['question2_tokens'].str.len().max(
        ) > 0, 'No tokens 2 found'
        print('Clean training questions')
        kaggle_train_data['question1_clean'] = kaggle_train_data[
            'question1_tokens'].progress_apply(' '.join)
        kaggle_train_data['question2_clean'] = kaggle_train_data[
            'question2_tokens'].progress_apply(' '.join)

        train_data = kaggle_train_data[a & b].reset_index(drop=True)
        merge_data = kaggle_train_data[np.invert(b)].reset_index(drop=True)
        valid_data = kaggle_train_data[np.invert(a)].reset_index(drop=True)

        print('Writing training data')
        self.output().makedirs()
        train_data.to_msgpack('cache/dataset/train.msg')
        merge_data.to_msgpack('cache/dataset/merge.msg')
        valid_data.to_msgpack('cache/dataset/valid.msg')
        del train_data, valid_data, kaggle_train_data

        kaggle_test_data = pandas.read_csv(
            os.path.expanduser('~/Datasets/Kaggle-Quora/test.csv'))
        print('Raw testing questions')
        kaggle_test_data['question1_raw'] = kaggle_test_data[
            'question1'].fillna('')
        kaggle_test_data['question2_raw'] = kaggle_test_data[
            'question2'].fillna('')
        print('Clean testing tokens')
        kaggle_test_data['question1_tokens'] = kaggle_test_data[
            'question1_raw'].progress_apply(clean_text)
        kaggle_test_data['question2_tokens'] = kaggle_test_data[
            'question2_raw'].progress_apply(clean_text)
        print('Clean testing questions')
        kaggle_test_data['question1_clean'] = kaggle_test_data[
            'question1_tokens'].progress_apply(' '.join)
        kaggle_test_data['question2_clean'] = kaggle_test_data[
            'question2_tokens'].progress_apply(' '.join)
        kaggle_test_data['is_duplicate'] = -1

        kaggle_test_data.to_msgpack('cache/dataset/test.msg')

        with self.output().open('w') as f:
            f.write('done')
Example #22
with open(fname, "r") as f:
    for word in f:
        english_long.add(word.strip())

# ## Create lists of stopwords, punctuation, and unicode characters
stop_words_list = stopwords_make(
)  # Define old vocab file path if you want to remove first, dirty elements
unicode_list = unicode_make()
punctstr = punctstr_make()

print("Stopwords, Unicodes, Punctuations lists creation complete!")

#word2vec computation
whole_text_unnested = []
whole_text_nested = []
tqdm.pandas(desc="Cleaning text")

for school in tqdm(df['text'], desc="Cleaning text"):
    doc = []
    for chunk in school.split("\n"):
        for sent in sent_tokenize(chunk):
            sent = clean_sentence_apache(sent,
                                         unhyphenate=True,
                                         remove_propernouns=False,
                                         remove_acronyms=False)
            sent = [word for word in sent if word != '']
            if len(sent) > 0:
                whole_text_unnested.append(sent)
                doc.append(sent)
    whole_text_nested.append(doc)
Example #23
def main():
    print("Loading Data...")
    names = ["OFFENSE_TYPE_ID", "OFFENSE_CATEGORY_ID", "FIRST_OCCURRENCE_DATE",
            "REPORTED_DATE", "DISTRICT_ID", "PRECINCT_ID",
            "NEIGHBORHOOD_ID", "IS_CRIME", "IS_TRAFFIC"]
    filePath="crime.csv"
    pd.set_option('display.float_format', '{:.2f}'.format)
    fileExist=False
    if(os.path.exists('crime-treated.csv')):
        fileExist =True
        filePath="crime-treated.csv"
        names =["HOUR_REPORTED","DAY_REPORTED","WEEKDAY_REPORTED","MONTH_REPORTED","YEAR_REPORTED",
                "OFFENSE_CATEGORY_ID","NEIGHBORHOOD_ID"]
        data=pd.read_csv(filePath, parse_dates=True,usecols=names,nrows = None)
    else:
        data=pd.read_csv(filePath, parse_dates=True)

    print("======================================================================DATA INFO======================================")
    data.info()
    print("===================================DATA INFO======================================")
    if(not fileExist):
        print(data.head(5))
        display(data.groupby([data.OFFENSE_CODE,data.OFFENSE_CODE_EXTENSION,data.OFFENSE_TYPE_ID]).size())
        temp=display(data.groupby([data.INCIDENT_ID,data.OFFENSE_CODE,data.OFFENSE_CODE_EXTENSION,data.OFFENSE_TYPE_ID]).size())
        print(temp)

        treatData(data)
    crimesDict = {
        'all-other-crimes': 1,
        'larceny' : 1,
        'theft-from-motor-vehicle' : 3,
        'drug-alcohol' : 2,
        'auto-theft' : 3,
        'white-collar-crime': 1,
        'burglary': 2,
        'public-disorder' : 2,
        'aggravated-assault': 3,
        'other-crimes-against-persons' : 2,
        'robbery' : 3,
        'sexual-assault' : 3,
        'murder': 3,
        'arson': 2
    }
    tqdm.pandas()
    print("Calculating Offense Weigh...")
    data['OFFENSE_WEIGH'] = data.progress_apply(lambda row:  crimesDict[row.OFFENSE_CATEGORY_ID], axis=1 )
    dataCount = data.groupby(['MONTH_REPORTED','WEEKDAY_REPORTED','HOUR_REPORTED','NEIGHBORHOOD_ID']).MONTH_REPORTED.agg('count').to_frame('COUNT').reset_index()
    dataClenad =data.groupby(['MONTH_REPORTED','WEEKDAY_REPORTED','HOUR_REPORTED','NEIGHBORHOOD_ID'], as_index=False).agg({'OFFENSE_WEIGH':'sum'})#['OFFENSE_WEIGH'].sum()['GEO_X'].mean()
    dataClenad['COUNT'] = dataCount['COUNT']
    print(dataClenad)
    print("Calculating Safety ...")
    medianCrime = dataClenad['OFFENSE_WEIGH'].median()
    modeCrime = dataClenad['OFFENSE_WEIGH'].mode().values
    modeQtd = dataClenad['COUNT'].mode().values

    print("CRIME MODE: ",modeCrime)
    dataClenad['SAFETY'] = dataClenad.progress_apply(lambda row: 1 if row.OFFENSE_WEIGH <= modeCrime and row.COUNT <= modeQtd else 0, axis=1 )

    counts = dataClenad['SAFETY'].value_counts()

    # CratGraph(dataClenad)
    ExecuteKNN(dataClenad)
    ExecuteDecisionTree(dataClenad)

    le = LabelEncoder()
    dataClenad = dataClenad.progress_apply(le.fit_transform)
    x_columns = ['MONTH_REPORTED', 'WEEKDAY_REPORTED', 'HOUR_REPORTED', 'NEIGHBORHOOD_ID', 'OFFENSE_WEIGH', 'COUNT']
    y_columns = ['SAFETY']
    x_train, x_test = train_test_split(dataClenad[x_columns], test_size=0.3)
    y_train, y_test = train_test_split(dataClenad[y_columns].values.ravel(), test_size=0.3)

    bagging(5, 200, dataClenad, np.ravel(y_train, order='C'), x_train)
    exit(0)
import re
import sys
import glob
import string
from pprint import pprint
from collections import Counter, OrderedDict

import spacy
nlp = spacy.load('en', disable=['parser', 'tagger', 'ner'])
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm, tqdm_notebook, tnrange
tqdm.pandas(desc='Progress')
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.autograd import Variable

import pickle
import math

import gensim.models
Example #25
                sep='\t'))

# create a dictionary locations : coordinates

df_loc = df_loc.sort_values(by=['ADM1'])

df_loc = df_loc.drop_duplicates(subset=['FULL_NAME_ND_RO'], keep='first')

loc_dict = dict(zip(df_loc.FULL_NAME_ND_RO, zip(df_loc.LAT, df_loc.LONG)))

loc_dict
import re, string

from tqdm import tqdm

tqdm.pandas(desc="progress")

from tqdm import tqdm_notebook

from fuzzywuzzy import process

from fuzzywuzzy import fuzz

remove_keywords = [
    'nan', 'Planes de Emergencia', 'Cruz Roja', 'Media Luna Roja', 'Guatemala',
    'Argentina', 'Japon', 'Juventud'
]

remove_keywords.extend(
    ['Malawi', 'Lebanon', 'لبنان', 'Nederland', 'Netherlands'])
import pandas as pd
import numpy as np
import csv
import difflib
from tqdm import tqdm
tqdm.pandas(desc="Fuzzy Match Progress")

patent_assignee_mapping = pd.read_csv("./patent_assignee/patent_assignee.tsv",
                                      sep="\t",
                                      low_memory=False)

patent_assignee_mapping.head()
patent_results = pd.read_csv(
    "../Phase-1-Search-Term/green-technology/all-patent-green-terms-searches.csv",
    dtype={
        'Emerging low carbon-Additional energy sources': bool,
        'Emerging low carbon-All-purpose': bool,
        'Emerging low carbon-Alternative fuel': bool,
        'Emerging low carbon-Alternative fuel vehicle': bool,
        'Emerging low carbon-Battery': bool,
        'Emerging low carbon-Building technologies': bool,
        'Emerging low carbon-Carbon capture & storage': bool,
        'Emerging low carbon-Electrochemical processes': bool,
        'Emerging low carbon-Energy management': bool,
        'Environmental-Air pollution': bool,
        'Environmental-All-purpose': bool,
        'Environmental-Biological treatment': bool,
        'Environmental-Contaminated land reclamation & remediation': bool,
        '"Environmental-Environmental monitoring': bool,
        ' instrumentation and analysis"': bool,
        'Environmental-Marine pollution control': bool,
Example #27
# -*- coding: utf-8 -*-
"""
Created on Thu Jul 26 21:20:13 2018

@author: 徐嘉诚
"""
import pandas as pd
import numpy as np
import gc
#import os
import datetime

from sklearn.metrics import mean_squared_error
from tqdm import tqdm, tqdm_notebook
tqdm.pandas(tqdm_notebook)

#filelist = os.listdir('../data')


def get_beautiful_test(test):
    test_rnd = np.round(test.iloc[:, 1:], 2)
    ugly_indexes = []
    non_ugly_indexes = []
    for idx in tqdm(range(len(test))):
        if not np.all(test_rnd.iloc[idx, :].values == test.iloc[idx,
                                                                1:].values):
            ugly_indexes.append(idx)
        else:
            non_ugly_indexes.append(idx)
    print(len(ugly_indexes), len(non_ugly_indexes))
    np.save('test_ugly_indexes', np.array(ugly_indexes))
Example #28
import torch
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt
import cv2
from tqdm import tqdm
import multiprocessing as mp
from preprocessing import read_images
from prototypicalNet import PrototypicalNet, train_step, test_step
tqdm.pandas(desc="my bar!")


def main():
    trainx, trainy = read_images('../input/omniglot/images_background/')
    testx, testy = read_images('../input/omniglot/images_evaluation/')
    use_gpu = torch.cuda.is_available()
    trainx = torch.from_numpy(trainx).float()
    testx = torch.from_numpy(testx).float()
    if use_gpu:
        trainx = trainx.cuda()
        testx = testx.cuda()
    print(trainx.size(), testx.size())
    num_episode = 16000
    frame_size = 1000
    trainx = trainx.permute(0, 3, 1, 2)
    testx = testx.permute(0, 3, 1, 2)
    frame_loss = 0
    frame_acc = 0
    for i in range(num_episode):
        loss, acc = train_step(trainx, trainy, 5, 60, 5)
        frame_loss += loss.data
Example #29
def urm_session_aware(mode,
                      action_score_dict,
                      cluster='no_cluster',
                      time_weight='lin'):
    """
    Create the URM considering the whole session of a user and giving scores based on its interactions

    :param train_df:
    :param test_df:
    :param time_weight:
    :param save_path:
    :param save:
    :return:
    """
    global tw
    tw = time_weight
    save_path = 'dataset/preprocessed/{}/{}/matrices/'.format(cluster, mode)

    accomodations_array = data.accomodations_ids()

    # load the dataframes according to the mode and cluster
    train_df = data.train_df(mode=mode, cluster=cluster)
    test_df = data.test_df(mode=mode, cluster=cluster)

    # fill missing clickout_item on the test dataframe
    test_df.fillna({'reference': -1}, inplace=True)
    train_df.fillna({'reference': -1}, inplace=True)

    # concatenate the train df and the test df, maintaining only the columns of interest
    df = pd.concat([train_df, test_df])[[
        'session_id', 'user_id', 'action_type', 'reference', 'impressions'
    ]]

    session_groups = df.groupby(['session_id', 'user_id'])
    session_ids = list(session_groups.groups.keys())

    rows_count = len(session_groups)
    cols_count = len(accomodations_array)

    # create dictionary (k: sessionId - v: urm row)
    row_of_sessionid = {}
    for i in range(len(session_ids)):
        row_of_sessionid[session_ids[i]] = i

    # create dictionary (k: accomodationId - v: urm col)
    col_of_accomodation = {}
    for i in range(cols_count):
        col_of_accomodation[accomodations_array[i]] = i

    print('dictionaries created\n')

    tqdm.pandas()
    sessions_score = session_groups.progress_apply(
        _compute_session_score).values
    print("apply function done\n")

    # create the URM using the data, indices and indptr arrays
    _data = []
    indptr = [0]
    indices = []

    values_inserted = 0
    for i in tqdm(range(rows_count)):
        score_dict = sessions_score[i]
        for k in score_dict.keys():
            indices.append(col_of_accomodation[k])
            _data.append(score_dict[k])
            values_inserted += 1
        indptr.append(values_inserted)
    _urm = sps.csr_matrix((_data, indices, indptr),
                          shape=(rows_count, cols_count))

    print("URM created\n")

    # check that the folder to save into exists
    cf.check_folder(save_path)

    print('Saving urm matrix... ')
    sps.save_npz('{}/urm_session_aware1_{}.npz'.format(save_path, time_weight),
                 _urm)
    print('done!')

    print('Saving row dictionary... ')
    np.save('{}/dict_row.npy'.format(save_path), row_of_sessionid)
    print('done!')

    print('Saving col dictionary... ')
    np.save('{}/dict_col.npy'.format(save_path), col_of_accomodation)
    print('done!')
Example #30
import pandas as pd
import pandas_datareader as web
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import re

from tqdm import tqdm
tqdm.pandas()  # need to run in notebook

import warnings
warnings.filterwarnings(action='ignore')

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

#-----------sklearn----------------------------------------------------

from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

#-----------other_models---------------------------------------------------

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

#-----------misc----------------------------------------------------

import gc
from datetime import datetime, timedelta
Example #31
    def create_urm(self):

        # load the dataframes according to the mode and cluster
        train_df = data.train_df(mode=self.mode, cluster=self.cluster)
        test_df = data.test_df(mode=self.mode, cluster=self.cluster)

        # fill missing clickout_item on the test dataframe
        test_df.fillna({'reference': -1}, inplace=True)
        train_df.fillna({'reference': -1}, inplace=True)

        # concatenate the train df and the test df, maintaining only the columns of interest
        df = pd.concat([train_df, test_df])[[
            'session_id', 'user_id', 'action_type', 'reference', 'impressions'
        ]]

        if self.type == 'user':
            session_groups = df.groupby(['user_id'])
        if self.type == 'session':
            session_groups = df.groupby(['user_id', 'session_id'])

        # the number of group keys coincides with the number of URM rows
        groups_keys = list(session_groups.groups.keys())

        rows_count = len(groups_keys)
        cols_count = len(self.accomodations_id)
        """
        create ROW dictionary
            if type == USER :
                key: user_id -- value: row_urm
            if type == SESSION :
                key: (user_id, session_id) -- value: row_urm
        """
        row_dict = {}
        for i in range(rows_count):
            row_dict[groups_keys[i]] = i
        """
        create COL dictionary
            key: accomodation_id -- value: col_urm
        """
        col_dict = {}
        for i in range(cols_count):
            col_dict[self.accomodations_id[i]] = i

        print('dictionaries created\n')

        tqdm.pandas()
        # compute the score
        sessions_score = session_groups.progress_apply(
            self._compute_session_score).values

        print("apply function done\n")

        # create the URM using the data, indices and indptr arrays
        _data = []
        indptr = [0]
        indices = []

        values_inserted = 0
        for i in tqdm(range(rows_count)):
            score_dict = sessions_score[i]
            for k in score_dict.keys():
                indices.append(col_dict[k])
                _data.append(score_dict[k])
                values_inserted += 1
            indptr.append(values_inserted)
        _urm = sps.csr_matrix((_data, indices, indptr),
                              shape=(rows_count, cols_count))

        print("URM created\n")

        print('Saving urm matrix... ')
        sps.save_npz(f'{self.save_path}/{self.name}.npz', _urm)
        print('done!')

        print('Saving row dictionary... ')
        np.save(f'{self.save_path}/{self.name}_dict_row.npy', row_dict)
        print('done!')

        print('Saving col dictionary... ')
        np.save(f'{self.save_path}/{self.name}_dict_col.npy', col_dict)
        print('done!')
Example #32
    'fuzz_token_sort_ratio_question': fuzz_token_sort_ratio_question,
    'fuzz_token_sort_ratio_lemma1': fuzz_token_sort_ratio_lemma1,
    'fuzz_token_sort_ratio_lemma2': fuzz_token_sort_ratio_lemma2,

    'fuzz_token_set_ratio_question': fuzz_token_set_ratio_question,
    'fuzz_token_set_ratio_lemma1': fuzz_token_set_ratio_lemma1,
    'fuzz_token_set_ratio_lemma2': fuzz_token_set_ratio_lemma2
}

# read train dataframe
df_train = pd.read_csv('../../data/Quora_Question_Pairs/middle/train_lemma_stem.csv')

# apply the functions from calc_table to df_train
for name, fun in tqdm(calc_table.items()):
    tqdm.pandas(desc=name)
    df_train[name] = df_train.progress_apply(fun, axis=1)

# save result and delete df_train
df_train.to_csv("../../data/Quora_Question_Pairs/middle/train_lemma_stem.csv")
del df_train

# read test dataframe
df_test = pd.read_csv('../../data/Quora_Question_Pairs/middle/test_lemma_stem.csv')

# apply the functions from calc_table to df_test
for name, fun in tqdm(calc_table.items()):
    tqdm.pandas(desc=name)
    df_test[name] = df_test.progress_apply(fun, axis=1)

# save result df_test
Example #33
def _createNewTrainingSetWithFeatureVariations(basicDf, newFeatureDf,
                                               featureOfInterest,
                                               variation_degree):
    import pandas as pd
    import numpy as np

    from tqdm import tqdm
    from utilities.pandasTools import suffixColumnsWithLabel

    try:
        # Create and register a new `tqdm` instance with `pandas`
        # (can use tqdm_gui, optional kwargs, etc.)
        tqdm.pandas()

        print(
            '_createNewTrainingSetWithFeatureVariations check point 1 >>> variation_degree >>> '
            + str(variation_degree))

        featureVariants = [[
            np.exp(
                suffixColumnsWithLabel(newFeatureDf, '_exp_' + str(iterator)) *
                iterator),
            np.exp(
                suffixColumnsWithLabel(newFeatureDf, '_exp_inv_' +
                                       str(iterator)) * iterator * -1),
            np.power(
                suffixColumnsWithLabel(newFeatureDf, '_pow_' + str(iterator)),
                iterator),
            np.power(
                suffixColumnsWithLabel(newFeatureDf, '_pow_inv_' +
                                       str(iterator)).astype(float),
                iterator * -1)
        ] for iterator in range(1, variation_degree + 1)]

        print('_createNewTrainingSetWithFeatureVariations check point 2')
        segmentCount, rowCount, colCount = len(featureVariants), len(
            featureVariants[0]), len(featureVariants[0][0])

        cummulativeListOfFeatures = np.empty(segmentCount, dtype=list)
        print('_createNewTrainingSetWithFeatureVariations check point 3')

        for segmentItr in range(0, segmentCount - 1):
            cummulativeListOfFeatures[segmentItr] = pd.DataFrame([])
            for rowItr in range(0, rowCount - 1):
                cummulativeListOfFeatures[segmentItr] = pd.concat([
                    cummulativeListOfFeatures[segmentItr],
                    featureVariants[segmentItr][rowItr]
                ],
                                                                  axis=1)
        print('_createNewTrainingSetWithFeatureVariations check point 4')

        cummulativeListOfFeatures = pd.concat(cummulativeListOfFeatures,
                                              axis=1)

        newTrainingSetDf = pd.concat(
            [basicDf, newFeatureDf, cummulativeListOfFeatures], axis=1)
        print('_createNewTrainingSetWithFeatureVariations check point 5')

        return newTrainingSetDf
    except:
        print("Error executing method >>> ")
        # exc_type, exc_obj, exc_tb = sys.exc_info()
        # fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        # print("Unexpected error:", sys.exc_info())
        # print(exc_type, fname, exc_tb.tb_lineno)

        # http://docs.python.org/2/library/sys.html#sys.exc_info
        exc_type, exc_value, exc_traceback = sys.exc_info(
        )  # most recent (if any) by default
        '''
        Reason this _can_ be bad: If an (unhandled) exception happens AFTER this,
        or if we do not delete the labels on (not much) older versions of Py, the
        reference we created can linger.

        traceback.format_exc/print_exc do this very thing, BUT note this creates a
        temp scope within the function.
        '''

        traceback_details = {
            'filename': exc_traceback.tb_frame.f_code.co_filename,
            'lineno': exc_traceback.tb_lineno,
            'name': exc_traceback.tb_frame.f_code.co_name,
            'type': exc_type.__name__,
            'message': traceback.extract_tb(exc_traceback)
        }

        del (exc_type, exc_value, exc_traceback
             )  # So we don't leave our local labels/objects dangling
        # This still isn't "completely safe", though!
        # "Best (recommended) practice: replace all exc_type, exc_value, exc_traceback
        # with sys.exc_info()[0], sys.exc_info()[1], sys.exc_info()[2]

        print()
        print(traceback.format_exc())
        print()
        print(traceback_template % traceback_details)
        print()

        #traceback.print_exception()
        raise
Example #34
def load_time_series_dataset(cfg, slide=None):
    '''
    Load the static and time series data from disk and join them. Create time series examples to form large dataset with
    time series and static features. Partition into train/val/test sets. Normalize numerical data.
    :param cfg: Project config (from config.yml)
    :param slide: Int that controls how many recent dates to cut off from the dataset
    :return: A dict of partitioned and normalized datasets, split into examples and labels
    '''

    # Load data info generated during preprocessing
    data = {}
    data['METADATA'] = {}
    input_stream = open(cfg['PATHS']['DATA_INFO'], 'r')
    data_info = yaml.full_load(input_stream)
    data['METADATA']['N_WEEKS'] = data_info['N_WEEKS']
    noncat_features = data_info[
        'NON_CAT_FEATURES']  # Noncategorical features to be scaled
    T_X = cfg['DATA']['TIME_SERIES']['T_X']
    tqdm.pandas()

    # Load data (before and after one-hot encoding)
    df_ohe = pd.read_csv(cfg['PATHS']['PROCESSED_OHE_DATA'])
    df = pd.read_csv(cfg['PATHS']['PROCESSED_DATA']
                     )  # Static and dynamic data prior to one hot encoding
    time_series_feats = [
        f for f in df.columns if ('-Day_' in f) and (')' not in f)
    ]

    # Partition dataset by date
    unique_dates = np.flip(df_ohe['Date'].unique()).flatten()
    val_split = cfg['TRAIN']['VAL_SPLIT']
    if val_split * unique_dates.shape[0] < 1:
        val_split = 1.0 / unique_dates.shape[
            0]  # Ensure validation set contains records from at least 1 time step
        print("Val set split in config.yml is too small. Increased to " +
              str(val_split))
    test_split = cfg['TRAIN']['TEST_SPLIT']
    if test_split * unique_dates.shape[0] < 1:
        test_split = 1.0 / unique_dates.shape[
            0]  # Ensure test set contains records from at least 1 time step
        print("Test set split in config.yml is too small. Increased to " +
              str(test_split))
    if slide is None:
        test_df_dates = unique_dates[-int(test_split * unique_dates.shape[0]):]
        val_df_dates = unique_dates[-int(
            (test_split + val_split) *
            unique_dates.shape[0]):-int(test_split * unique_dates.shape[0])]
        train_df_dates = unique_dates[0:-int((test_split + val_split) *
                                             unique_dates.shape[0])]
    else:
        test_split_size = max(int((test_split) * unique_dates.shape[0]), 1)
        val_split_size = max(int((val_split) * unique_dates.shape[0]), 1)
        offset = slide * test_split_size
        if offset == 0:
            test_df_dates = unique_dates[-(test_split_size):]
        else:
            test_df_dates = unique_dates[-(test_split_size + offset):-offset]
        val_df_dates = unique_dates[-(val_split_size + test_split_size +
                                      offset):-(test_split_size + offset)]
        train_df_dates = unique_dates[0:-(val_split_size + test_split_size +
                                          offset)]

    train_df_ohe = df_ohe[df_ohe['Date'].isin(train_df_dates)]
    val_df_ohe = df_ohe[df_ohe['Date'].isin(val_df_dates)]
    test_df_ohe = df_ohe[df_ohe['Date'].isin(test_df_dates)]
    train_df = df[df['Date'].isin(train_df_dates)]
    test_df = df[df['Date'].isin(test_df_dates)]
    print('Train set size = ' + str(train_df_ohe.shape[0]) +
          '. Val set size = ' + str(val_df_ohe.shape[0]) +
          '. Test set size = ' + str(test_df_ohe.shape[0]))

    # Save train & test set for LIME
    train_df.to_csv(cfg['PATHS']['TRAIN_SET'],
                    sep=',',
                    header=True,
                    index=False)
    test_df.to_csv(cfg['PATHS']['TEST_SET'], sep=',', header=True, index=False)

    # Anonymize clients
    train_df_ohe.drop(['ClientID', 'Date'], axis=1, inplace=True)
    val_df_ohe.drop(['ClientID', 'Date'], axis=1, inplace=True)
    test_df_ohe.drop(['ClientID', 'Date'], axis=1, inplace=True)

    # Get indices of noncategorical features
    noncat_feat_idxs = [
        test_df_ohe.columns.get_loc(c) for c in noncat_features
        if c in test_df_ohe
    ]

    # Separate ground truth from dataframe and convert to numpy arrays
    data['Y_train'] = np.array(train_df_ohe.pop('GroundTruth'))
    data['Y_val'] = np.array(val_df_ohe.pop('GroundTruth'))
    data['Y_test'] = np.array(test_df_ohe.pop('GroundTruth'))

    # Convert feature dataframes to numpy arrays
    data['X_train'] = np.array(train_df_ohe)
    data['X_val'] = np.array(val_df_ohe)
    data['X_test'] = np.array(test_df_ohe)

    # Normalize numerical data and save the scaler for prediction.
    col_trans_scaler = ColumnTransformer(transformers=[
        ('col_trans_ordinal', StandardScaler(), noncat_feat_idxs)
    ],
                                         remainder='passthrough')
    data['X_train'] = col_trans_scaler.fit_transform(
        data['X_train'])  # Only fit train data to prevent data leakage
    data['X_val'] = col_trans_scaler.transform(data['X_val'])
    data['X_test'] = col_trans_scaler.transform(data['X_test'])
    dump(col_trans_scaler,
         cfg['PATHS']['SCALER_COL_TRANSFORMER'],
         compress=True)

    data['METADATA']['NUM_TS_FEATS'] = len(
        time_series_feats)  # Number of different time series features
    data['METADATA']['T_X'] = T_X
    return data
Example #35
"""
@author - Mohsin
"""
import numpy as np
np.random.seed(786)  # for reproducibility
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler

from tqdm import tqdm

tqdm.pandas(tqdm)

from utils import *
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

if __name__ == "__main__":
    LOGGER_FILE = "prepImageFeature3.log"

    IMAGE_FILE_1 = "../utility/df_image_feats3.csv"

    ######################   Logger   #########################################
    handler = logging.FileHandler(LOGGER_FILE)
    handler.setLevel(logging.INFO)

    # create a logging format
    formatter = logging.Formatter(
Example #36
def get_variant_data(connection, limit=0, testRun = False):
    '''
    Get Variant data for Solr document.
    '''

    # Special variant IDs that represent edge cases (used for test runs):
    testAssociationId = [
        42893, # Missing from solr slim
        47195, # kgp ID
    ]

    # function to retrieve further data from the database:
    def get_more_variant_data(row):

        # Extracting basic variant information:
        resourcename = 'variant'
        ID = row['ID']
        rsID = row['RS_ID']
        consequence = str(row['FUNCTIONAL_CLASS']).replace("_", " ").capitalize() 

        # Extracting association count:
        association_count = variant_cls.get_association_count(ID)

        # Extracting study count:
        study_count = variant_cls.get_study_count(ID)

        # We don't care about variants that have no associations:
        if association_count == 0: 
            return(1)

        # Extracting genomic location:
        location = variant_cls.get_variant_location(ID)

        # Extracting mapped genes:
        mapped_genes_list = variant_cls.get_mapped_genes(ID)
        mapped_genes_names = [x.split("|")[0] for x in mapped_genes_list]
        mapped_genes_names = list(set(mapped_genes_names)) # GOCI-2475 - unique set of names are generated.

        # Extracting merged rsID:
        current_rsID = variant_cls.get_current_rsID(ID)

        # Assign merged rsID and generate title:
        title = ''
        if current_rsID:
            merged_rsID = rsID
            title = "%s (%s)" %(current_rsID, merged_rsID)
        else: 
            current_rsID = rsID
            merged_rsID = ''
            title = current_rsID
        
        # Combining data into a dictionary:
        varDoc = {
            'resourcename' : resourcename,
            'id' : "%s:%s" % (resourcename,ID),
            'title' : title,
            'rsID' : rsID,
            'current_rsID' : current_rsID,
            'merged_rsID' : merged_rsID,
            'associationCount' : association_count,
            'studyCount' : study_count,
            'mappedGenes' : mapped_genes_list,
            'consequence' : consequence,
        }

        # Adding only valid location indicated by integer position:
        if isinstance(location['position'], int):
            varDoc['chromosomeName'] = location['chromosome']
            varDoc['chromosomePosition'] = location['position']
            varDoc['region'] = location['region']

        # Adding description to the document:
        coordinates = '%s:%s' %(location['chromosome'], location['position'])
        genes_str = ",".join(mapped_genes_names)
        varDoc['description'] =  "|".join([coordinates, 
            str(location['region']), consequence,genes_str])
        
        # Adding to document list:
        all_variant_data.append(varDoc)

    # Initialize empty list for the documents:
    all_variant_data = []

    # Step 1: initialize variant object:
    variant_cls = variant_sqls(connection)

    # Step 2: retrieve all the variants in the database:
    variants_df = variant_cls.get_snps()

    # Initialize progress bar:
    tqdm.pandas(desc="Returning variant data")

    # Step 3: Calling apply to retrieve all variant data:
    if limit != 0:
        variants_df[0:limit].progress_apply(get_more_variant_data, axis=1)
    elif testRun:
        variants_df[variants_df['ID'].isin(testAssociationId)].progress_apply(
            get_more_variant_data, axis=1)
    else:
        variants_df.progress_apply(get_more_variant_data, axis=1)

    return all_variant_data
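# Hedged, self-contained illustration (not from the source) of the pattern used above:
# progress_apply with axis=1 is called purely for its side effect of appending one
# document dict per row to a list.
import pandas as pd
from tqdm import tqdm

tqdm.pandas(desc="Collecting rows")
docs = []
demo_df = pd.DataFrame({'ID': [1, 2, 3], 'RS_ID': ['rs1', 'rs2', 'rs3']})
demo_df.progress_apply(lambda row: docs.append({'id': row['ID'], 'rsID': row['RS_ID']}),
                       axis=1)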
Example #37
0
        wordcount[word] += 1

# function to replace every word in the lemmatized message with its most popular synonym
def build_new_sent(sent):
    list_sent = []
    for word in str(sent).split():
        word_d = dict()
        for syn in wordnet.synsets(word):
            for lemma in syn.lemma_names():
                word_d[lemma] = wordcount[lemma]
        if len(word_d.items()):
            list_sent.append(max(word_d.items(), key=operator.itemgetter(1))[0])
        else:
            list_sent.append(word)

    return ' '.join(list_sent)
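# The wordcount loop at the top of this snippet is truncated; a plausible
# reconstruction (an assumption, not the original code) that would populate it from
# the lemmatized columns before build_new_sent is applied:
from collections import defaultdict

wordcount = defaultdict(int)
for col in ('lemma_q1', 'lemma_q2'):
    for sent in df_train[col].astype(str):
        for word in sent.split():
            wordcount[word] += 1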

# apply build_new_sent to df_train and save the results
tqdm.pandas(desc="lemma_q1")
df_train['lemma_q1_new'] = df_train['lemma_q1'].progress_apply(build_new_sent)
tqdm.pandas(desc="lemma_q2")
df_train['lemma_q2_new'] = df_train['lemma_q2'].progress_apply(build_new_sent)
df_train.to_csv("../middle/train_lemma_stem.csv", index=False)

# apply build_new_sent to df_test and save the results
tqdm.pandas(desc="lemma_q1")
df_test['lemma_q1_new'] = df_test['lemma_q1'].progress_apply(build_new_sent)
tqdm.pandas(desc="lemma_q2")
df_test['lemma_q2_new'] = df_test['lemma_q2'].progress_apply(build_new_sent)
df_test.to_csv("../middle/test_lemma_stem.csv", index=False)
Example #38
0
def main():

    # open
    df_train = pd.read_csv('../middle/train_lemma_stem.csv')
    df_test = pd.read_csv('../middle/test_lemma_stem.csv')

    # drop None
    df_train = df_train.replace(np.nan, ' ', regex=True)
    df_test = df_test.replace(np.nan, ' ', regex=True)

    # tuples of (input column, output column) names
    name_tuple = [('lemma_q1', 'lemma1doc'),
                  ('lemma_q2', 'lemma2doc'),
                  ('lemma_q1_new', 'lemma1doc_syn'),
                  ('lemma_q2_new', 'lemma2doc_syn')]

    # apply the spacy_doc function to the df_train columns listed in name_tuple
    for t in name_tuple:
        tqdm.pandas(desc=t[0]+'train')
        df_train[t[1]] = df_train[t[0]].progress_apply(spacy_doc)

    # find similarity in df_train
    tqdm.pandas(desc='spacy_similarity_lem+train')
    df_train['spacy_sim_lem'] = df_train.progress_apply(spacy_similarity_lem, axis=1)
    tqdm.pandas(desc='spacy_similarity_syn+train')
    df_train['spacy_sim_syn'] = df_train.progress_apply(spacy_similarity_syn, axis=1)

    df_train.to_csv('../middle/train_lemma_stem.csv')

    # apply the spacy_doc function to the df_test columns listed in name_tuple
    for t in name_tuple:
        tqdm.pandas(desc=t[0]+'test')
        df_test[t[1]] = df_test[t[0]].progress_apply(spacy_doc)

    # find similarity in df_test
    tqdm.pandas(desc='spacy_similarity_lem+test')
    df_test['spacy_sim_lem'] = df_test.progress_apply(spacy_similarity_lem, axis=1)
    tqdm.pandas(desc='spacy_similarity_syn+test')
    df_test['spacy_sim_syn'] = df_test.progress_apply(spacy_similarity_syn, axis=1)

    df_test.to_csv('../middle/test_lemma_stem.csv', index=False)
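# Hedged sketch (hypothetical, not from the source) of the helper functions used in
# main() above: spacy_doc parses a string into a spaCy Doc, and the similarity
# helpers compare the parsed question pairs. A model with word vectors is assumed.
import spacy

nlp = spacy.load('en_core_web_md')

def spacy_doc(text):
    return nlp(str(text))

def spacy_similarity_lem(row):
    return row['lemma1doc'].similarity(row['lemma2doc'])

def spacy_similarity_syn(row):
    return row['lemma1doc_syn'].similarity(row['lemma2doc_syn'])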
Example #39
0
def mapUniProt(geneIndexFile):
    # fills in the uniProt column of the geneIndex dataframe by calling the UniProt mapping server
    tqdm.pandas(desc="mapping entrez to uniProt")
    indexFrame = pd.read_pickle(geneIndexFile)
    indexFrame['uniProt'] = indexFrame['Entrez_Gene_Id'].progress_apply(lambda entrez: getUniprot(entrez))
Example #40
0
#     Keywords                      X

# Legend:
#     X = Complete
#     V = Currently Void
#     P = Partially Stabilized

# ------------------------------------------------------------------------------------------------------------
# United Kingdom
# ------------------------------------------------------------------------------------------------------------

os.chdir(MAIN_FOLDER + "/Data/Governmental_Science_Funding/UK")
uk_df = pd.read_csv("uk_funding_data.csv")

# Start tqdm
tqdm.pandas(desc="status")

# Limit to Research Grants
uk_df = uk_df[uk_df['ProjectCategory'] == 'Research Grant'].reset_index(drop=True)

# Drop unneeded columns
to_drop = ['ProjectReference',
           'EndDate',
           'Status',
           'PIOtherNames',
           'PI ORCID iD',
           'StudentSurname',
           'StudentFirstName',
           'StudentOtherNames',
           'Student ORCID iD',
           'GTRProjectUrl',
Example #41
0
#    'pos_max': positive sentiment (strongest sentiment statement)',
#    'comp_max': composite sentiment (strongest sentiment statement)',

# Import dependencies
from statistics import mean
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import sent_tokenize
import numpy as np
from tqdm import tqdm
import collections

# load data
review = pd.read_csv('C:/Users/adamb/Desktop/vader/yelp_review.csv')

tqdm.pandas()  # progress bar
analyser = SentimentIntensityAnalyzer()  # create analyser


# Function to get scores
def sentiment_analyzer_scores(text):
    """Calculates sentiment scores for each text review
    
    Parameters
    ----------
    text : str
        Text to analyze

    Returns
    ----------
    list of float:
Example #42
0
from molmaps import distances, calculator, summary
import pandas as pd
import numpy as np
from rdkit import Chem
from tqdm import tqdm
tqdm.pandas(ascii=True)


def caldis(data, idx, tag, methods=['correlation', 'cosine', 'jaccard']):

    ##############################################################
    Nf = len(feature.fingerprint.Extraction().bitsinfo)
    data0 = loadnpy('./data/fingerprint_8206960.npy', N=Nf, dtype=bool)
    groups = data0.sum(axis=1)
    from sklearn.model_selection import GroupKFold
    G = GroupKFold(n_splits=10)
    sp = G.split(X=data0, groups=groups)
    spl = list(sp)
    sidx = spl[0][1]
    del data0
    print(len(sidx))

    data = data[sidx]
    data = data.astype(np.float32, copy=False)
    #############################################################

    for method in methods:
        res = calculator.pairwise_distance(data, n_cpus=16, method=method)
        res = np.nan_to_num(res, copy=False)
        df = pd.DataFrame(res, index=idx, columns=idx)
        df = df.astype('float32')
Example #43
0
import random

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize.regexp import regexp_tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from tqdm import tqdm

tqdm.pandas(desc="progress-bar")
import re

from gensim.models import Doc2Vec, doc2vec
from sklearn import utils
from sklearn.utils import shuffle

import classifierutils
import dataread

headers = dataread.read_file('top_sectionheaders_5000.txt')

results = []

for header in headers:
    ret_val = []
    for item in header_corpus[header]['labelled_tokenised'].TEXT:
        logreg.predict([model_dbow.infer_vector(item)])
Example #44
0
def apply_preprocessing(df):
    tqdm.pandas()
    df['reviewClean_sw'], df['noun_adjective'] = zip(
        *df['reviewText'].progress_apply(text_preprocessing))
    return df
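# Hedged sketch (hypothetical, not part of the source) of a text_preprocessing
# function compatible with the zip(*...) unpacking above: it must return a pair
# (cleaned_text, noun_adjective_tokens) for every review.
import re

def text_preprocessing(text):
    tokens = re.findall(r'[a-z]+', str(text).lower())
    cleaned = ' '.join(tokens)
    noun_adjective = tokens  # placeholder; real code would keep only nouns/adjectives via POS tagging
    return cleaned, noun_adjective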
Example #45
0
import pandas as pd
import numpy as np
from tqdm import tqdm

df = pd.DataFrame(np.random.randint(0, 100, (100000, 6)))

# Register `pandas.progress_apply` and `pandas.Series.progress_map` with `tqdm`
# (can use `tqdm_gui`, `tqdm_notebook`, optional kwargs, etc.)
tqdm.pandas(desc="my bar!")

# Now you can use `progress_apply` instead of `apply`
# and `progress_map` instead of `map`
df.progress_apply(lambda x: x**2)
# can also groupby:
# df.groupby(0).progress_apply(lambda x: x**2)

# -- Source code for `tqdm_pandas` (really simple!)
# def tqdm_pandas(t):
#   from pandas.core.frame import DataFrame
#   def inner(df, func, *args, **kwargs):
#       t.total = df.size // len(df)
#       def wrapper(*args, **kwargs):
#           t.update(1)
#           return func(*args, **kwargs)
#       result = df.apply(wrapper, *args, **kwargs)
#       t.close()
#       return result
#   DataFrame.progress_apply = inner
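# The comments above also mention `progress_map`; a small hedged illustration on one
# column of the same dataframe:
df[0].progress_map(lambda x: x * 2)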
Example #46
0
import networkx as nx
import numpy as np
import pandas as pd
from tqdm import tqdm

from feature_engineering.tools import lit_eval_nan_proof

# This script computes the authors_citation and coauthor score features from the graph of
# coauthorship and the authors' graph of citations.
# It takes approximately 5 minutes to run.

# progress bar for pandas
tqdm.pandas()

# path
path_to_data = "data/"

# loading data
converter_dict = {
    'authors': lit_eval_nan_proof,
    'journal': lit_eval_nan_proof,
    'title': lit_eval_nan_proof,
    'abstract': lit_eval_nan_proof
}
nodes = pd.read_csv(path_to_data + "nodes_preprocessed.csv",
                    converters=converter_dict)
nodes.set_index("id", inplace=True)
training = pd.read_csv(path_to_data + "training_features.txt")
training.set_index("my_index", inplace=True)
testing = pd.read_csv(path_to_data + "testing_features.txt")
testing.set_index("my_index", inplace=True)
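# Hedged sketch (an assumption, not the source's code) of how the coauthorship graph
# mentioned in the comment above could be built, assuming nodes['authors'] holds a
# list of author names per paper:
coauthorship = nx.Graph()
for authors in nodes['authors'].dropna():
    for i, a in enumerate(authors):
        for b in authors[i + 1:]:
            coauthorship.add_edge(a, b)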
Example #47
0
import numpy as np
import pandas as pd
pd.set_option("display.max_columns",1000) # don’t put … instead of multi columns
pd.set_option('expand_frame_repr',False) # for not wrapping columns if you have many

from optparse import OptionParser
from tqdm import tqdm
tqdm.pandas()
from sys import platform


import plotly as py
import cufflinks

import sys
sys.path.append('../')
sys.path.append('../../')
sys.path.append('../../../')
import int_force

if __name__ == '__main__':
    timer=int_force.global_imports.timer()
    help_text='''
    examples:
        seq 0 40 |xargs -I ^ echo python3 "%prog" -s ^ \&
        sbatch --mem=1800m -c1 --time=0:50:0 --array=0-399 --wrap 'python3 %prog -s ${SLURM_JOB_ID}_${SLURM_ARRAY_TASK_ID}'
        sbatch --mem=1800m -c1 --time=0:50:0 --array=0-199 --wrap 'python3 %prog -s ${SLURM_JOB_ID}_${SLURM_ARRAY_TASK_ID} -q "[0,3,150]" -b "[5,17,19,10001]" -m 15'
    '''
    parser = OptionParser(usage=help_text, version="%prog 1.0 beta")
    parser.add_option("-n", dest="samples", type="int", default=1000, help='number of dots X2 because you have x and y. for example 1000. you better use 5 [default: %default]')
    parser.add_option("-s", dest="split_id", type="str", default='0', help='the split unique id so it will not override old output [default: %default]')
Example #48
0
from timeit import default_timer as timer
import pandas as pd
import numpy as np
from umap import UMAP
from ivis import Ivis
from evaluation import Doc2VecModel
from tqdm import tqdm
from pyod.models.ocsvm import OCSVM
from pyod.models.hbos import HBOS
from pyod.models.pca import PCA
from itertools import permutations
from utils import next_path, product_dict, get_scores, reject_outliers, sample_data, remove_short_texts



tqdm.pandas(desc="progess: ")


class IQROutlier:
    def __init__(self, contamination=0.1):
        self.contamination = contamination

    def fit(self, X, y=None):
        pcnt = self.contamination / 2
        qlow, self.median, qhigh = np.quantile(X, [pcnt, 0.50, 1-pcnt])
        self.iqr = qhigh - qlow
        return self

    def transform(self, X, thresh_factor=1.0):
        iqr = self.iqr*thresh_factor
        preds = ((np.abs(X - self.median)) >= iqr/2)