def main():
    business_data = pd.read_csv('processed_data/business_data.csv', index_col=None)
    checkin_data = pd.read_csv('processed_data/checkin_data.csv', index_col=None)
    tip_data = pd.read_csv('processed_data/tip_data.csv', encoding="latin_1", index_col=None)
    review_data = pd.read_csv('processed_data/review_data.csv', encoding="latin_1", index_col=None)
    training_data = pd.read_csv('data/train_labels.csv', index_col=None)

    # Convert training date to seconds for easier maths
    training_data.loc[:, 'date'] = training_data.loc[:, 'date'].apply(functions.date_to_seconds).astype('int32')

    # Figure out the names of the TFIDF feature columns
    feature_re = re.compile(r'^[tr]\.')
    tip_features = [col for col in tip_data.columns if feature_re.match(col) is not None]
    review_features = [col for col in review_data.columns if feature_re.match(col) is not None]

    # Add restaurant IDs to everything
    id_dict = functions.build_restaurant_id_map('data/restaurant_ids_to_yelp_ids.csv')
    map_to_boston_ids = lambda yid: id_dict.get(yid, np.nan)

    for df in [business_data, checkin_data, tip_data, review_data]:
        df['restaurant_id'] = df['business_id'].map(map_to_boston_ids)
        df.drop(['business_id'], axis=1, inplace=True)
        # drop those businesses that are not in the Boston dataset
        df.dropna(axis=0, subset=['restaurant_id'], inplace=True)

    # FIXME Do something other than average the values across matching restaurant IDs?
    # This works because the only non-numeric fields are restaurant_id and business_id.
    # restaurant_id becomes the new index after grouping, so it is also not a normal column.
    business_data = business_data.groupby('restaurant_id', sort=False).mean()
    checkin_data = checkin_data.groupby('restaurant_id', sort=False).mean()

    # Sum the TFIDF features for everything that comes before in time for a given tip or review
    # FIXME maybe don't drop userid?
    tip_data.reset_index(inplace=True)
    tip_data.drop(['user_id', 'text'], axis=1, inplace=True)
    review_data.reset_index(inplace=True)
    review_data.drop(['text'], axis=1, inplace=True)

    # Finally, join everything
    training_data = create_evaluation_data(business_data, checkin_data, tip_data, review_data,
                                           tip_features, review_features, training_data)
    training_data.to_csv("processed_data/training_data.csv", index=None)

    # And join the submission data
    submission_data_1 = pd.read_csv('data/SubmissionFormat.csv', index_col=None)
    submission_data_1.loc[:, 'date'] = submission_data_1.loc[:, 'date'].apply(functions.date_to_seconds).astype('int32')
    submission_data_1 = create_evaluation_data(business_data, checkin_data, tip_data, review_data,
                                               tip_features, review_features, submission_data_1)
    submission_data_1.to_csv("processed_data/submission_data.csv", index=None)

    submission_data_2 = pd.read_csv('data/PhaseIISubmissionFormat.csv', index_col=None)
    submission_data_2.loc[:, 'date'] = submission_data_2.loc[:, 'date'].apply(functions.date_to_seconds).astype('int32')
    submission_data_2 = create_evaluation_data(business_data, checkin_data, tip_data, review_data,
                                               tip_features, review_features, submission_data_2)
    submission_data_2.to_csv("processed_data/phase2_data.csv", index=None)

    return
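
# Note: functions.date_to_seconds and functions.build_restaurant_id_map are not
# shown in this example. A minimal sketch of what those helpers might look like
# (assumed implementations; the date format and CSV column layout are guesses):
import csv
import time

def date_to_seconds(date_str):
    # Assumed 'YYYY-MM-DD' strings; convert to seconds since the epoch.
    return int(time.mktime(time.strptime(date_str, '%Y-%m-%d')))

def build_restaurant_id_map(csv_path):
    # Map every Yelp business id in the file to its Boston restaurant_id.
    id_map = {}
    with open(csv_path) as fh:
        for row in csv.DictReader(fh):
            for column, value in row.items():
                if column != 'restaurant_id' and value:
                    id_map[value] = row['restaurant_id']
    return id_map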
Example #2
def read_dataframe_from_csv(fname):
    """Read a dataframe from a csv file.

        Args:
            fname(str): Csv filename
    """
    return pd.read_csv(fname)
    def dataset_from_csv(self, filename, time_column='point_in_time'):
        """
        Read dataset from csv file

        filename : str
                   filename to use

        return DataFrame
        """
        return pd.read_csv(filename, parse_dates=[time_column])
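# A short usage sketch for the two loaders above (file and column names are
# illustrative only; `loader` stands for an instance of the class that owns
# dataset_from_csv):
#
#     frame = read_dataframe_from_csv('measurements.csv')
#     ds = loader.dataset_from_csv('measurements.csv', time_column='point_in_time')
#     ds['point_in_time'].dtype   # datetime64[ns], thanks to parse_dates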
Example #4
    def load_data(self, keys=None):
        """
        Load the data from the ``.tsv`` files in the ``self.df_file`` 
        dictionary.

        The optional *keys* parameter is a list of types of counts to be 
        loaded. By default, all counts are loaded.
        """
        if keys is None:
            keys = self.df_file.keys()
        else:
            if not all(key in self.df_file.keys() for key in keys):
                raise EnrichError("Cannot load unsaved counts", self.name)
        for key in keys:
            self.counts[key] = pd.read_csv(self.df_file[key], sep="\t")
def get_review_exp(owner, project_name, commit_id, author):
    pr = get_pull_request_for_commit(owner, project_name, commit_id)
    rcount=0
    pr_exp = []
    last_max_pr = 20000
    pr_exp_df_old= None
    if len(pr)!=0:
        number = int(pr['number'])
        # check if the project already exists
        if f'{project_name}.csv' in os.listdir('exp_data'):
            # check if the last max pr was higher than the current one
            print('file exists')
            pr_exp_df_old = pd.read_csv(f'exp_data/{project_name}.csv')
            last_pr = pr_exp_df_old.pr.max()
            if number <=last_pr:
                # return the last exp
                rcount = pr_exp_df_old[pr_exp_df_old.pr<=number].exp.values[0]
                return rcount
            # else set the end point to the last max pr
            else:
                last_max_pr = pr_exp_df_old.pr.values[0]
                rcount = pr_exp_df_old.exp.values[0]
        else: 
            print('starting a new project')
        while number > last_max_pr:
            rnames = get_reviewer_names(number)

            if author in rnames:
                rcount+=1
            
            pr_exp.append({'pr':number, 'exp':rcount})
            number-=1
            print(f'review count {rcount}')
        pr_exp_df = pd.DataFrame(pr_exp)
        pr_exp_df.exp = pr_exp_df.exp.astype('int')
        pr_exp_df.pr = pr_exp_df.pr.astype('int')
        pr_exp_df.exp = rcount - pr_exp_df.exp
        if pr_exp_df_old is not None:
            pr_exp_df = pd.concat([pr_exp_df_old, pr_exp_df]).sort_values(by='pr', ascending=False)
            pr_exp_df.reset_index(drop=True, inplace=True)
        pr_exp_df.to_csv(f'exp_data/{project_name}.csv')
        return rcount
    else:
        return rcount
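# The helpers get_pull_request_for_commit and get_reviewer_names are not shown.
# A rough sketch of how they could be built on the public GitHub REST API
# (assumed behaviour; the original may use different endpoints, auth, or
# signatures; in particular get_reviewer_names above takes only the PR number,
# so owner/repo would have to come from module-level state):
import requests

def get_pull_request_for_commit(owner, project_name, commit_id):
    # Pull requests that contain the given commit; return the first (or an empty dict).
    url = f'https://api.github.com/repos/{owner}/{project_name}/commits/{commit_id}/pulls'
    resp = requests.get(url)
    resp.raise_for_status()
    prs = resp.json()
    return prs[0] if prs else {}

def get_reviewer_names(owner, project_name, number):
    # Logins of everyone who submitted a review on the pull request.
    url = f'https://api.github.com/repos/{owner}/{project_name}/pulls/{number}/reviews'
    resp = requests.get(url)
    resp.raise_for_status()
    return [review['user']['login'] for review in resp.json()]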
Example #6
def from_lpdata(name):
    filename = '%s_lpdata.txt' % name
    df = pd.read_csv(filename)
    return df
Example #7
def create_files(n=n, copula_string='independence-copula', path=path,
                 df=pd.read_csv(data_file, index_col=0, parse_dates=True),
                 dimkeys=dimkeys, index=index, kind=kind,
                 marginal_string=marginal_string, method=method,
                 segment_marginal=segment_marginal):
    """
    This function creates the csv files that correspond to the dataframes we use.
    It creates the U sample and the P sample and returns a column of S values (cf. documentation).

    :param n: size of the sample
    :param copula_string: the kind of copula you want to compute
    :param datatype: string, whether it is 'actuals' or 'errors'
    :param index_diag: index of the diagonal where you want to project
    :return: vector of the S values (cf. the LaTeX doc for the meaning of U, O, P, Q, R, S)
    """

    l = len(df.index)
    if method == 'wholeyear':
        dt_list = df.index
    elif method == 'daytoday':
        dt_list = df.index[100:l-1]

    dimension = len(dimkeys)


    dict_epis_parameters = dict.fromkeys(dimkeys)
    if marginal_string == 'univariate-epispline':
        for i in dimkeys:
            if os.path.isfile(path + '/epispline_parameters_' + str(i) + '_' + marginal_data + '.csv'):
                dict_epis_parameters[i] = pd.read_csv(path + '/epispline_parameters_' + str(i) + '_' + marginal_data + '.csv', index_col=0, parse_dates=True)
            else:
                dict_epis_parameters[i] = pd.DataFrame(None, index=dt_list,
                                                   columns=['alpha', 'beta', 'N', 'w0', 'u0', 'delta'])

    diago = diag(dimension)

    suffix = return_suffix(kind, index, dimkeys, copula_string,segment_marginal)

    P_file_exists = os.path.isfile(path + '/samples/P_' + suffix)
    if P_file_exists:
        df_P = pd.read_csv(path + '/samples/P_' + suffix, index_col=0, parse_dates=True)
    else:
        df_U = dict.fromkeys(dimkeys)
        for i in dimkeys:
            U_file_exists = os.path.isfile(path + '/samples/U_' + copula_string + '_' + str(i) + '.csv')
            if U_file_exists:
                df_U[i] = pd.read_csv(path + '/samples/U_' + copula_string + '_' + str(i) + '.csv', index_col=0, parse_dates=True)
            else:
                df_U[i] = pd.DataFrame(None, index=dt_list, columns=range(n))

        df_P = pd.DataFrame(None, index=dt_list, columns=range(n))
    S = []


    if method == 'wholeyear':

        input = dict.fromkeys(dimkeys)
        for i in dimkeys:
            input[i] = df[i].values.tolist()

        marginals = dict.fromkeys(dimkeys)
        for i in dimkeys:
            marginals[i] = marginal_from_input(marginal_string, input[i], None, method, dict_epis_parameters[i])

        distr_class = distribution_factory(copula_string)
        mydistr = distr_class(dimkeys, input)

    for mydt in dt_list:
        #print(mydt)
        if method == 'daytoday':

            input = dict.fromkeys(dimkeys)
            for i in dimkeys:
                input[i] = df[i].loc[df.index < mydt].values.tolist()

            if segment_marginal == 'segmented':

                input_segmented = dict.fromkeys(dimkeys)
                for i in dimkeys:
                    segmented_df = segmenter.OuterSegmenter(df.loc[df.index < mydt], df,
                                                            'copula_experiments/segment_input_wind_FH'+str(i)+'.txt',
                                                            mydt).retval_dataframe()
                    input_segmented[i] = segmented_df[i].values.tolist()
                input_marginals= input_segmented
            else:
                input_marginals=input


            marginals = dict.fromkeys(dimkeys)
            for i in dimkeys:
                marginals[i] = marginal_from_input(marginal_string, input_marginals[i], mydt, method, dict_epis_parameters[i])

            distr_class = distribution_factory(copula_string)
            mydistr = distr_class(dimkeys, input)

        if P_file_exists:
            P = df_P.loc[mydt].values
        else:
            if U_file_exists:
                U = np.transpose(df_U.loc[mydt].values)
            else:
                U = mydistr.generates_U(n)
                for j in range(dimension):
                    df_U[dimkeys[j]].loc[mydt] = U[:, j]

            P = diago.proj_scalar(U, index, kind)
            df_P.loc[mydt] = P

        O = df.loc[mydt].loc[dimkeys].values
        Q = np.zeros(dimension)
        for i in range(dimension):
            Q[i] = marginals[dimkeys[i]].cdf(O[i])

        R = diago.proj_scalar(Q, index, kind, mydistr)
        counter = sum(1 for i in range(n) if P[i] <= R)
        S.append(counter / n)


    if marginal_string == 'univariate-epispline':
        for i in dimkeys:
            dict_epis_parameters[i].to_csv(path + '/epispline_parameters_'+str(i)+'_' + marginal_data + '.csv')

    if not P_file_exists:
        df_P.to_csv(path + '/samples/P_' + suffix)
        for i in dimkeys:
            df_U[i].to_csv(path + '/samples/U_' + copula_string +'_'+str(i)+ '.csv')

    return S
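# Note on the S values returned above: for each timestamp the loop computes the
# fraction of sampled projections P that do not exceed the observed projection R,
# i.e. S_t = (1/n) * #{ i : P[i] <= R }, an empirical rank (PIT-like) statistic
# of the observation within the copula sample.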
Example #8
def stock_price(symbol):
    file = 'data/{}'.format(symbol.upper())
    if not os.path.exists(file):
        pass
    return pd.read_csv(file)
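# The existence check above does nothing when the file is missing, and
# pd.read_csv will raise on its own anyway. A sketch of one possible tightening
# (not the original behaviour):

def stock_price_checked(symbol):
    path = 'data/{}'.format(symbol.upper())
    if not os.path.exists(path):
        raise FileNotFoundError('no cached prices for {}'.format(symbol))
    return pd.read_csv(path)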
Example #9
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
from nltk.corpus import stopwords
from collections import Counter
import os
import io

# This pulls in the cleaned file (nulls etc. already removed).
final_set = pd.read_csv(
    '/Users/jacobtryba/DSI/assignments/capstone2/data/cleaned_data.csv')

# This defines a function to remove punctuation, which is then applied to the
# three text columns used in the analysis: description, title, genre.
punct_to_remove = string.punctuation


def remove_punctuation(text):
    return text.translate(str.maketrans('', '', punct_to_remove))


final_set['description'] = final_set['description'].apply(remove_punctuation)
final_set['title'] = final_set['title'].apply(remove_punctuation)
final_set['genre'] = final_set['genre'].apply(remove_punctuation)

# This builds the set of stop words, which is then used on the description column.
stop_words = set(stopwords.words('english'))
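# The snippet ends before the stop words are actually stripped; a minimal sketch
# of how they would typically be applied to the same columns (assumed, not part
# of the original code):

def remove_stopwords(text):
    return " ".join(word for word in text.split() if word.lower() not in stop_words)

final_set['description'] = final_set['description'].apply(remove_stopwords)
final_set['title'] = final_set['title'].apply(remove_stopwords)
final_set['genre'] = final_set['genre'].apply(remove_stopwords)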
Example #10
import pandas as pd
df = pd.read_csv('build.csv')
print(df)
Example #11
def load_fit_manif2table(unit_list,
                         netname,
                         dataroot,
                         ang_step=9,
                         save=True,
                         load=False,
                         GANname="",
                         savestr=""):
    """Load experiments into table, Algorithmic version
    Esp. it load evolution information into the tab.
    load: if true, it will load saved stats table instead of computing a new one. 
    """
    if load:
        nettab = pd.read_csv(
            join(dataroot, "summary",
                 '%s_ManifExpFitSum%s.csv' % (netname, savestr)), index_col=0)
        return nettab
    theta_arr = np.arange(-90, 90.1, ang_step) / 180 * np.pi
    phi_arr = np.arange(-90, 90.1, ang_step) / 180 * np.pi
    stat_col = []
    for unit in unit_list[:]:
        layer = unit[1]
        layerdir = "%s_%s_manifold-%s" % (netname, layer, GANname)
        RFfit = unit[-1]
        suffix = "rf_fit" if RFfit else "original"
        npyfns = glob(join(dataroot, layerdir, "*.npy"))
        if len(unit) == 6:
            pattern = re.compile("Manifold_score_%s_(\d*)_%d_%d_%s.npy" %
                                 (layer, unit[3], unit[4], suffix))
        else:
            pattern = re.compile("Manifold_score_%s_(\d*)_%s.npy" %
                                 (layer, suffix))
        matchpatt = [pattern.findall(fn) for fn in npyfns]
        iChlist = [int(mat[0]) for mat in matchpatt if len(mat) == 1]
        fnlist = [fn for mat, fn in zip(matchpatt, npyfns) if len(mat) == 1]
        print("Found %d units in %s - %s layer!" %
              (len(iChlist), netname, layer))
        for iCh in iChlist:  # range
            if len(unit) == 6:
                unit_lab = "%s_%d_%d_%d" % (layer, iCh, unit[3], unit[4])
            elif len(unit) == 4:
                unit_lab = "%s_%d" % (
                    layer,
                    iCh,
                )
            else:
                raise NotImplementedError
            explabel = "%s_%s" % (unit_lab, suffix)
            data = np.load(
                join(dataroot, layerdir, "Manifold_score_%s.npy" % (explabel)))
            Mdata = np.load(
                join(dataroot, layerdir, "Manifold_set_%s.npz" % (explabel)))
            # final generation activation from Evolution
            gens = Mdata["evol_gen"]
            finalscores = Mdata["evol_score"][gens == gens.max()]
            initscores = Mdata["evol_score"][gens == (gens.min() + 1)]
            tval, pval = ttest_1samp(finalscores, initscores.mean())
            for spi in range(data.shape[0]):  # all spaces
                unitstat = EasyDict()
                if len(unit) == 6:
                    unitstat.pos = (unit[3], unit[4])
                elif len(unit) == 4:
                    unitstat.pos = None
                actmap = data[spi, :, :]  # PC2-3 space
                param, param_std, _, R2 = fit_Kent_Stats(theta_arr=theta_arr,
                                                         phi_arr=phi_arr,
                                                         act_map=actmap)
                unitstat.netname = netname
                unitstat.layer = layer
                unitstat.iCh = iCh
                unitstat.explabel = explabel
                unitstat.space = spi
                unitstat.RFfit = RFfit
                unitstat.imgsize = Mdata["imgsize"]
                unitstat.corner = Mdata["corner"]
                # Maximal activation from Manifold,
                unitstat.actmax = actmap.max()
                unitstat.actmin = actmap.min()
                unitstat.evolfinact = finalscores.mean()
                unitstat.evolttest = tval
                unitstat.evolttest_p = pval
                # Fitting stats
                unitstat.R2 = R2
                for i, pnm in enumerate(param_names):
                    unitstat[pnm] = param[i]
                    unitstat[pnm + "_std"] = param_std[i]
                # Append to collection
                stat_col.append(unitstat)

    nettab = pd.DataFrame(stat_col)
    if save:
        os.makedirs(join(dataroot, "summary"), exist_ok=True)
        nettab.to_csv(
            join(dataroot, "summary",
                 '%s_ManifExpFitSum%s.csv' % (netname, savestr)))
    return nettab
    if args['dataset'] == 'CIFAR':
        model = CIFAR_Model()
        _, valid_loader, _, _ = create_loaders(is_train=False,
                                               is_valid=True,
                                               is_test=False)
    else:
        model = MNIST_Model()
        _, valid_loader, _, _ = create_loaders(which_dataset='MNIST',
                                               is_train=False,
                                               is_valid=True,
                                               is_test=False)

    mask = create_mask(model)
    model.load_state_dict(torch.load('environment\\model.pth'))
    mask.load_state_dict(torch.load('environment\\mask.pth'))

    criterion, _ = create_criterion_optimizer(model)

    valid_loss = .0
    model.eval()
    with torch.no_grad():
        for images, labels in valid_loader:
            apply_mask(model, mask)
            outputs = model(images)
            valid_loss += criterion(outputs, labels).item()

        valid_loss /= len(valid_loader)

    df = pd.read_csv('wrapping\\pruned_layers.csv', sep=',')
    df.iloc[-1, -1] = valid_loss
    df.to_csv('wrapping\\pruned_layers.csv', sep=',', index=False)
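    # create_mask and apply_mask are not shown. A rough sketch of the intended
    # pruning step (assumed: the mask module holds one 0/1 tensor per model
    # parameter, applied by element-wise multiplication):
    #
    #     def apply_mask(model, mask):
    #         with torch.no_grad():
    #             for param, m in zip(model.parameters(), mask.parameters()):
    #                 param.mul_(m)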
Example #13
import pandas as pd

df = pd.read_csv('EUR_USD_2000/EUR_USD_SECOND_QUARTER_2000.csv')

rows_list = []
for row in input_rows:

    dict1 = {}
    # get input row in dictionary format
    # key = col_name
    dict1.update()

    rows_list.append(dict1)

df = pd.DataFrame(rows_list)
# In[8]:

df

# In[7]:

df['tokens_clean'] = df['tokens_nostop'].apply(lambda x: [" ".join(x)])
#df['tokens_clean'] = df['tokens_nostop'].apply(lambda x: [str(x)])
#" ".join([str(item) for var in data for item in var])

# In[ ]:

df.to_csv('cleaned_review_data.csv')
#df = pandas.read_csv('../data/cleaned_review_data.csv')
df = pd.read_csv('../data/cleaned_review_data.csv', encoding='utf-8')

# In[13]:

import csv
df = csv.reader(open('../data/cleaned_review_data.csv'))

# In[ ]:

df2 = pd.DataFrame()
# r.to_csv("data.csv")
# grouped.to_csv("nodupes_alllyrics_tokenized.csv")
df2 = pd.read_csv("../data/cleaned_review_data.csv")
df2

# In[77]:
Example #16
def ngram(inp, window):  # signature inferred from the call below; the original snippet starts mid-function
    rows, cols = inp.shape
    resRows = rows - window + 1

    res = pd.DataFrame(columns=range(window*cols))
    for idx in range(resRows):
        if window - 1 > 0:
            # flatten the current row together with the following window-1 rows
            newRow = inp.iloc[idx:idx + window].values.flatten()
        else:
            newRow = inp.iloc[idx].values

        if idx % 10000 == 0:
            print(idx)
        res.loc[idx] = newRow
    return res

# Read training data into memory
iotRaw = pd.read_csv('resources/normal_20170202_2229.csv')

# Select training columns
iot = iotRaw[["LinAccX..g.","LinAccY..g.","LinAccZ..g."]]

# Set training window and ngram
window = 200
iot = ngram(iot, window)

# Send the data to H2O
iot_hex  = h2o.H2OFrame(iot)

# Run the deeplearning model in autoencoder mode
neurons = 50
iot_dl = h2o.estimators.deeplearning.H2OAutoEncoderEstimator(model_id = "iot_dl",
                                                             autoencoder = True,
Example #17
def load_facebook():
    return pd.from_csv("facebook/normalized-facebook-insights-post-level-data.csv")
Example #18
def file_selection_from_csv(filename):
    return ESGFFileSelection(pd.read_csv(filename))
def readTweets():
    return list(pd.read_csv('data/tweets.csv'))