Example #1
def get_silhouette(df):
    df=df[(df.AB!=".")].copy()
    df.loc[:,'AB']=pd.to_numeric(df.loc[:,'AB'])
    df.loc[:,'CN']=pd.to_numeric(df.loc[:,'CN'])

    tp=df.iloc[0,:].loc['svtype']

    [mn_CN, mn_AB]=df.loc[:, ['CN', 'AB']].mean(skipna=True)
    [sd_CN, sd_AB]=df.loc[:, ['CN', 'AB']].std(skipna=True)

    if df.loc[:,'GT'].unique().size==1:
        df.loc[:,'sil_gt_avg']=1
        df.loc[:, 'sil_gt']=1
        df=df[ ['var_id', 'sample', 'svtype', 'AF', 'GT', 'CN', 'AB', 'sil_gt_avg', 'sil_gt']]
        return df

    #standardize the 2 dims
    if sd_AB>0.01:
        df.loc[:, 'AB1']=(df.loc[:,'AB']-mn_AB)/sd_AB
    else: 
        df.loc[:, 'AB1']=df.loc[:, 'AB']
    if tp in ['DEL', 'DUP', 'MEI'] or sd_CN>0.01:
        df.loc[:, 'CN1']=(df.loc[:,'CN']-mn_CN)/sd_CN
    else:
        df.loc[:, 'CN1']=df.loc[:, 'CN']

    
    gt_code={'0/0':1, '0/1':2, '1/1':3}
    df.loc[:,'gtn']=df.loc[:, 'GT'].map(gt_code)

    dist_2d_sq=spatial.distance.squareform(spatial.distance.pdist(df[['AB1', 'CN1']], metric='cityblock'))
    df.loc[:, 'sil_gt_avg']=metrics.silhouette_score(dist_2d_sq, df.loc[:, 'gtn'].values, metric='precomputed')
    df.loc[:, 'sil_gt']=metrics.silhouette_samples(dist_2d_sq, df.loc[:, 'gtn'].values, metric='precomputed')
    df=df[ ['var_id', 'sample', 'svtype', 'AF', 'GT', 'CN', 'AB', 'sil_gt_avg', 'sil_gt']]
    return df
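A minimal usage sketch for get_silhouette (not from the original source): it builds a tiny made-up genotype table and applies the function per variant, assuming pandas, scipy and scikit-learn are imported under the names the function uses (pd, spatial, metrics).

import pandas as pd
from scipy import spatial
from sklearn import metrics

# toy table: one deletion genotyped in four samples (values invented)
genotypes_df = pd.DataFrame({
    'var_id': ['DEL_1'] * 4,
    'sample': ['s1', 's2', 's3', 's4'],
    'svtype': ['DEL'] * 4,
    'AF':     [0.25] * 4,
    'GT':     ['0/0', '0/1', '0/1', '1/1'],
    'CN':     ['2', '1', '1', '0'],
    'AB':     ['0.0', '0.48', '0.52', '1.0'],
})

# one call per variant; sil_gt_avg is the variant-level silhouette, sil_gt the per-sample values
sil_df = genotypes_df.groupby('var_id', group_keys=False).apply(get_silhouette)
print(sil_df[['var_id', 'GT', 'sil_gt_avg', 'sil_gt']])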
Example #2
def parse_bammarkduplicates(fn):
    """
    Parse the output from biobambam2's bammarkduplicates and return as pandas
    Series.

    Parameters
    ----------
    fn : str
        Path to the output file to parse.

    Returns
    -------
    metrics : pandas.Series
        Duplicate metrics.

    hist : pandas.Series
        Duplicate histogram.

    """
    with open(fn) as f:
        lines = [x.strip().split('\t') for x in f.readlines()]
    metrics = pd.Series(lines[4], lines[3])
    m = pd.to_numeric(metrics[metrics.index[1:]])
    metrics[m.index] = m.values

    vals = np.array(lines[8:-1])
    hist = pd.Series(vals[:, 1], index=[int(float(x)) for x in vals[:, 0]])
    hist = pd.to_numeric(hist)
    return metrics, hist
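A hedged usage sketch for parse_bammarkduplicates; 'sample.markdup.metrics' is a hypothetical path to a biobambam2 bammarkduplicates metrics file, and pandas/numpy are assumed to be imported as pd/np as the function requires.

import numpy as np
import pandas as pd

dup_metrics, dup_hist = parse_bammarkduplicates('sample.markdup.metrics')  # hypothetical path
print(dup_metrics)       # one value per column of the metrics line
print(dup_hist.head())   # duplication histogram (bin -> value)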
Example #3
def plot_week_data_facet(df, sample_type, metric, hue=None, hide_donor_baseline=False, hide_control_baseline=False, dm=None, save=True):
    df['week'] = pd.to_numeric(df['week'], errors='coerce')
    df[metric] = pd.to_numeric(df[metric], errors='coerce')
    df = df.sort_values(by='week')
    asd_data = filter_sample_md(df, [('SampleType', sample_type), ('Group', 'autism')])
    order = sorted(asd_data['SubjectID'].unique())

    grid = sns.FacetGrid(asd_data, col="SubjectID", hue=hue, col_wrap=6, size=1.5, palette=palette,
                         col_order=order)

    control_y = np.median(control_metric(df, sample_type, metric=metric))

    grid.map(plt.plot, "week", metric, marker="o", ms=4)

    if not hide_control_baseline:
        grid.map(plt.axhline, y=control_y, ls="--", c=palette['neurotypical'])
    if not hide_donor_baseline:
        donor_initial_y = np.median(donor_metric(df, metric=metric, group='donor-initial', sample_type=sample_type))
        donor_maintenance_y = np.median(donor_metric(df, metric=metric, group='donor-maintenance', sample_type=sample_type))
        grid.map(plt.axhline, y=donor_initial_y, ls="--", c=palette['donor'])
        grid.map(plt.axhline, y=donor_maintenance_y, ls=":", c=palette['donor'])
    if dm is not None:
        inter_nt_dm = inter_neurotypical_distances(df, dm, sample_type=sample_type)
        median_inter_nt = np.median(inter_nt_dm.condensed_form())
        grid.map(plt.axhline, y=median_inter_nt,
            color=palette['neurotypical'], linestyle='-.', label='between neurotypical distance (median)')

    grid.set(xticks=[0, 3, 10, 18], xlim=(-0.5, 18.5))
    grid.set_axis_labels("", "")
    grid.fig.tight_layout(w_pad=1)
    if save:
        filename = '%s-%s-%s-detail.pdf' % (sample_type, metric.replace(' ', '-'), hue)
        grid.savefig('engraftment-plots/%s' % filename)
    return grid
Example #4
    def import_data(self):
        '''
        Reads the ICICIDirect CSV file row by row and builds one Transaction per row.
        companyDB uses companyName as the key and that company's transactions as the values.
        '''
#         companyDB = defaultdict(list)
#         f = open(self.csvFileName, 'r')  #  opens the csv file
#         try:
#             reader = csv.reader(f, delimiter=',', quotechar='|')  # Creates the reader object
#             for row in reader:  #  Iterates the rows of the file in orders
#                 date = datetime.datetime.strptime(row[0], "%d-%b-%y").date()
#                 company = row[1]
#                 action = row[2]
#                 quantity = int(row[3])
#                 rate = float(row[4])
#                 brokerage = float(row[6])  # row[5] is TOTAL cost, ignored, as all the rest
#                 transaction = Transaction(date, company, action, quantity, rate, brokerage)
#                 companyDB.setdefault(company, []).append(transaction)
#         finally:
#             f.close()  #  closing
#         return companyDB
        list_of_companies = []
        if os.path.isfile(self.csvFileName):
            column_names = ['Date','CompanyName','OrderType','OrderQuantity','OrderPrice','OrderTotal','OrderCommision','OrderId1','OrderId2','OrderRolling','Account','Exchange']
            df = pd.read_csv(self.csvFileName, header=None, names = column_names)
            df = df[['Date','CompanyName','OrderType','OrderQuantity','OrderPrice']]
            df['Date'] = pd.to_datetime(df['Date'], format="%d-%b-%y").dt.date
            df['OrderQuantity'] = pd.to_numeric(df['OrderQuantity'].astype(str).str.replace(',',''), errors='coerce')
            df['OrderPrice'] = pd.to_numeric(df['OrderPrice'].astype(str).str.replace(',',''), errors='coerce')
            df = df.sort_values(['CompanyName','Date'], ascending=[True,True])
            list_of_companies = []
            for name, subdf in df.groupby('CompanyName'):
                list_of_records = subdf.to_dict('records')
                transactions = [Transaction(rec['Date'],rec['CompanyName'],rec['OrderType'],rec['OrderQuantity'],rec['OrderPrice'],0) for rec in list_of_records]
                list_of_companies.append(Company(name, transactions))
        self.list_of_companies = list_of_companies
Example #5
    def clean_pitching(self):
        """
        Does basic cleaning of pitching table
        """
        players_pitching = self.players_pitching.drop('index', axis=1)
        players_pitching = players_pitching[players_pitching.HR != 'HR']
        players_pitching.Tm = players_pitching.Tm.apply(
            lambda x: ''.join(filter(lambda y: y in printable, x)))
        # dropping rows that has 'Teams' in Teams column
        players_pitching.Tm = players_pitching['Tm'].apply(
            lambda x: 'toss_away' if 'Teams' in x else x)
        players_pitching = players_pitching[players_pitching.Tm != 'toss_away']

        # dropping W-L perc as it is largely correlated
        players_pitching = players_pitching.drop('W-L_perc', axis=1)
        # fill missing Aff with N/A
        players_pitching.Aff = players_pitching.Aff.fillna('N/A')
        # dropping rows with missing AgeDif
        players_pitching = players_pitching[players_pitching.AgeDif.notnull()]
        # fill the rest of missing values with 0
        players_pitching = players_pitching.fillna(0)

        # dropping players whose age is '--'
        for id in self.drop_id_lst:
            players_pitching = players_pitching[players_pitching[
                'player_id'] != id]

        players_pitching[['Age', 'AgeDif']] = players_pitching[
            ['Age', 'AgeDif']].apply(lambda x: pd.to_numeric(x))
        players_pitching[players_pitching.columns[8:]] =\
            players_pitching[players_pitching.columns[8:]].apply(
                lambda x: pd.to_numeric(x))
        # keep the cleaned table on the instance
        self.players_pitching = players_pitching
Example #6
def test_really_large_in_arr_consistent(large_val, signed,
                                        multiple_elts, errors):
    # see gh-24910
    #
    # Even if we discover that we have to hold float, does not mean
    # we should be lenient on subsequent elements that fail to be integer.
    kwargs = dict(errors=errors) if errors is not None else dict()
    arr = [str(-large_val if signed else large_val)]

    if multiple_elts:
        arr.insert(0, large_val)

    if errors in (None, "raise"):
        index = int(multiple_elts)
        msg = "Integer out of range. at position {index}".format(index=index)

        with pytest.raises(ValueError, match=msg):
            to_numeric(arr, **kwargs)
    else:
        result = to_numeric(arr, **kwargs)

        if errors == "coerce":
            expected = [float(i) for i in arr]
            exp_dtype = float
        else:
            expected = arr
            exp_dtype = object

        tm.assert_almost_equal(result, np.array(expected, dtype=exp_dtype))
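The test above pins down how to_numeric treats oversized integers under each errors mode; for reference, a small sketch of the errors argument on ordinary input (plain pandas behaviour, not part of the test suite):

import pandas as pd

vals = ['1', '2', 'spam']
print(pd.to_numeric(vals, errors='coerce'))   # unparseable entries become NaN -> [ 1.  2. nan]
try:
    pd.to_numeric(vals, errors='raise')       # the default: raise on the first bad value
except ValueError as exc:
    print(exc)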
Example #7
def main():
    datasets_path = r'D:\data\jdcl\datasets\2017-11-20_2017-12-23.csv'
    model_path = r'D:\model\randomforest_2017-11-20_2017-12-23.p'
    cols = []
    # model = pickle.load(open(model_path))
    label_name = 'label_class'
    df = pd.read_csv(datasets_path)
    for col in df.columns:
        try:
            pd.to_numeric(df[col])
            cols.append(col)
        except BaseException:
            # print(col)
            # traceback.print_exc()
            pass
    date_1 = '2017-12-01'
    date_2 = '2017-12-10'
    date_3 = '2017-12-30'
    TdX = df.loc[df.create_time < date_1, cols].values
    TsX = df.loc[(df.create_time > date_1) & (df.create_time < date_2), cols].values
    S = df.loc[(df.create_time > date_2) & (df.create_time < date_3), cols].values
    label_df = df[[label_name, 'create_time']]
    label_d_df = label_df.loc[label_df.create_time < date_1, label_name].values
    label_s_df = label_df.loc[(label_df.create_time > date_1) & (label_df.create_time < date_2), label_name].values
    classifier = TrAdaBoostClassifier()
    result = classifier.fit(TdX, label_d_df, TsX, label_s_df, S, base_classifier=DecisionTreeClassifier)
    print(TdX.shape[0] + TsX.shape[0])
    np.save('result.npy', result)
Example #8
def get_data_ethbtc():
    order_book = public_client.get_product_order_book('ETH-BTC', level=3)
    ask_tbl = pd.DataFrame(data=order_book['asks'], columns=['price', 'volume', 'address'])
    bid_tbl = pd.DataFrame(data=order_book['bids'], columns=['price', 'volume', 'address'])

    # building subsetted table for ask data only
    # sell side (would be Magma)
    ask_tbl['price'] = pd.to_numeric(ask_tbl['price'])
    ask_tbl['volume'] = pd.to_numeric(ask_tbl['volume'])
    first_ask = float(ask_tbl.iloc[1, 0])
    perc_above_first_ask = (1.025 * first_ask)
    ask_tbl = ask_tbl[(ask_tbl['price'] <= perc_above_first_ask)]
    ask_tbl['color'] = 'red'

    # building subsetted table for bid data only
    # buy side (would be Viridis)
    bid_tbl['price'] = pd.to_numeric(bid_tbl['price'])
    bid_tbl['volume'] = pd.to_numeric(bid_tbl['volume'])
    first_bid = float(bid_tbl.iloc[1, 0])
    perc_above_first_bid = (0.975 * first_bid)
    bid_tbl = bid_tbl[(bid_tbl['price'] >= perc_above_first_bid)]
    bid_tbl['color'] = 'green'

    # append the buy and sell side tables to create one cohesive view
    fulltbl = bid_tbl.append(ask_tbl)
    # limit our view to only orders greater than or equal to 1 ETH in size
    fulltbl = fulltbl[(fulltbl['volume'] >= 1)]

    # takes the square root of the volume (to be used later on for the purpose of sizing the orders)
    fulltbl['sqrt'] = np.sqrt(fulltbl['volume'])
    # takes the average of the closest bid and ask to determine the market price
    fulltbl['market_price'] = ((perc_above_first_ask + perc_above_first_bid) / 2)

    return fulltbl
Example #9
def _get_Laskar_data(verbose=True):
    longorbit = {}
    sources = {}
    pandas_kwargs = {'delim_whitespace':True,
                     'header':None,
                     'index_col':0,
                     'names':['kyear','ecc','obliquity','long_peri'],}
    for time in filenames:
        local_path = os.path.join(os.path.dirname(__file__), "data", filenames[time])
        remote_path = base_url + filenames[time]
        if time == 'future':
            pandas_kwargs['skiprows'] = 1 # first row is kyear=0, redundant
        longorbit[time], path = load_data_source(local_path=local_path,
                remote_source_list=[remote_path],
                open_method = pd.read_csv,
                open_method_kwargs=pandas_kwargs,
                verbose=verbose)
        sources[time] = path
    xlongorbit = {}
    for time in ['past', 'future']:
        # Cannot convert to float until we replace the D notation with E for floating point numbers
        longorbit[time].replace(to_replace='D', value='E', regex=True, inplace=True)
        xlongorbit[time] = xr.Dataset()
        xlongorbit[time]['ecc'] = xr.DataArray(pd.to_numeric(longorbit[time]['ecc']))
        for field in ['obliquity', 'long_peri']:
            xlongorbit[time][field] = xr.DataArray(np.rad2deg(pd.to_numeric(longorbit[time][field])))
    longorbit = xr.concat([xlongorbit['past'], xlongorbit['future']], dim='kyear')
    # add 180 degrees to long_peri (see lambda definition, Berger 1978 Appendix)
    longorbit['long_peri'] += 180.
    longorbit['precession'] = longorbit.ecc*np.sin(np.deg2rad(longorbit.long_peri))
    longorbit.attrs['Description'] = 'The Laskar et al. (2004) orbital data table'
    longorbit.attrs['Citation'] = 'https://doi.org/10.1051/0004-6361:20041335'
    longorbit.attrs['Source'] = [sources[time] for time in sources]
    longorbit.attrs['Note'] = 'Longitude of perihelion is defined to be 0 degrees at Northern Vernal Equinox. This differs by 180 degrees from the source files.'
    return longorbit
Example #10
def coerce_types(df, field_properties):
    decimal_fields = []
    integer_fields = []
    string_fields = []
    datetime_fields = []

    for fp in field_properties:
        name = fp['name']
        data_type = fp['type']
        if data_type in fields_to_coerce_to_float:
            decimal_fields.append(name)
        elif data_type in fields_to_coerce_to_integer:
            integer_fields.append(name)
        elif data_type in fields_to_coerce_to_string:
            string_fields.append(name)
        elif data_type in fields_to_coerce_to_datetime:
            datetime_fields.append(name)

    # Forcing data types
    for decimal_field in decimal_fields:
        df[decimal_field] = pd.to_numeric(df[decimal_field], errors='coerce')

    for integer_field in integer_fields:
        df[integer_field] = pd.to_numeric(df[integer_field], errors='coerce')

    for datetime_field in datetime_fields:
        df[datetime_field] = pd.to_datetime(
            df[datetime_field],
            errors='coerce',
            infer_datetime_format=True
        )
    return df
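A hedged usage sketch for coerce_types; the module-level sets it consults (fields_to_coerce_to_float and friends) are defined elsewhere in the original project, so the names and values below are stand-ins chosen only so the example runs as a single file.

import pandas as pd

fields_to_coerce_to_float = {'decimal'}        # stand-in constants
fields_to_coerce_to_integer = {'integer'}
fields_to_coerce_to_string = {'string'}
fields_to_coerce_to_datetime = {'datetime'}

df = pd.DataFrame({'price': ['1.5', 'n/a'],
                   'qty': ['3', '4'],
                   'when': ['2020-01-01', 'bad date']})
props = [{'name': 'price', 'type': 'decimal'},
         {'name': 'qty', 'type': 'integer'},
         {'name': 'when', 'type': 'datetime'}]
print(coerce_types(df, props).dtypes)          # price/qty numeric, when datetime64[ns]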
Example #11
    def test_numeric(self):
        s = pd.Series([1, -3.14, 7], dtype='O')
        res = to_numeric(s)
        expected = pd.Series([1, -3.14, 7])
        tm.assert_series_equal(res, expected)

        s = pd.Series([1, -3.14, 7])
        res = to_numeric(s)
        tm.assert_series_equal(res, expected)

        # GH 14827
        df = pd.DataFrame(dict(
            a=[1.2, decimal.Decimal(3.14), decimal.Decimal("infinity"), '0.1'],
            b=[1.0, 2.0, 3.0, 4.0],
        ))
        expected = pd.DataFrame(dict(
            a=[1.2, 3.14, np.inf, 0.1],
            b=[1.0, 2.0, 3.0, 4.0],
        ))

        # Test to_numeric over one column
        df_copy = df.copy()
        df_copy['a'] = df_copy['a'].apply(to_numeric)
        tm.assert_frame_equal(df_copy, expected)

        # Test to_numeric over multiple columns
        df_copy = df.copy()
        df_copy[['a', 'b']] = df_copy[['a', 'b']].apply(to_numeric)
        tm.assert_frame_equal(df_copy, expected)
Example #12
def get_data(ticker, start, end):
    
    global missing_tickers
    df = None

    # Gets stock data from web
    try:
        # Creates Google Finance url
        url = '''http://www.google.com/finance/historical?q=''' + ticker.replace('^', '-') + '''&startdate=''' + start.strftime("%B")[:3]
        url += '''%20''' + str(start.day) + ''',%20''' + str(start.year) + '''&enddate=''' + end.strftime("%B")[:3] + '''%20'''
        url += str(end.day) + ''',%20''' + str(end.year) + '''&output=csv'''
        
        df = pd.read_csv(url).rename(columns = {'\xef\xbb\xbfDate': 'Date'})
        df[open_str] = pd.to_numeric(df[open_str], errors = 'coerce')
        df[close_str] = pd.to_numeric(df[close_str], errors = 'coerce')
    except:
        try:
            # Creates yahoo finance url
            url = '''http://ichart.finance.yahoo.com/table.csv?s=''' + ticker + '''&a=''' + str(start.month - 1)
            url += '''&b=''' + "%02d" % start.day + '''&c=''' + str(start.year) + '''&d=''' + str(end.month - 1)
            url += '''&e=''' + "%02d" % end.day + '''&f=''' + str(end.year) + '''&g=d&ignore=.csv'''
            df = pd.read_csv(url).rename(columns = {close_str: 'Non Adjust Close', 'Adj Close': close_str})            
        except:
            missing_tickers.append(ticker)

    if (df is not None) and (len(df) > 0):
        df[return_str] = df.apply(lambda row: return_rate(row[open_str], row[close_str]), axis = 1)
        df['ticker'] = ticker
        df['Date'] = pd.to_datetime(df['Date'])
    
    return df
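A hedged usage sketch for get_data; open_str, close_str, return_str, return_rate and missing_tickers are globals defined elsewhere in the original script, so stand-ins are declared here. Both download endpoints (Google Finance historical and Yahoo's ichart) are long defunct, so in practice the call returns None and the ticker lands in missing_tickers.

import datetime
import pandas as pd

missing_tickers = []
open_str, close_str, return_str = 'Open', 'Close', 'Return'   # stand-ins for the script's globals

def return_rate(open_price, close_price):                      # stand-in for the script's helper
    return (close_price - open_price) / open_price

df = get_data('AAPL', datetime.date(2016, 1, 4), datetime.date(2016, 12, 30))
print(missing_tickers)   # expect ['AAPL'] because both URLs now fail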
def get_empty_summary_df():
    # create an empty dataframe with just the site_names, xs, ys, and srcs to fill in the summary data
    empty_daily_tots_df = get_data_frame_from_table('sites_list')
    empty_daily_tots_df = empty_daily_tots_df.set_index('site_name')
    empty_daily_tots_df['x'] = pd.to_numeric(empty_daily_tots_df['x'])
    empty_daily_tots_df['y'] = pd.to_numeric(empty_daily_tots_df['y'])
    return empty_daily_tots_df
def get_concentration_functions(composition_table_dict):

    meta = composition_table_dict['meta']
    composition_table = Table.from_dict(composition_table_dict['data'])
    elements = [col for col in composition_table.columns if col not in meta]
    x = composition_table["X"].values
    y = composition_table["Y"].values
    cats = composition_table["X"].unique()
    concentration, conc, d, y_c, functions = {}, {}, {}, {}, RecursiveDict()

    for el in elements:
        concentration[el] = to_numeric(composition_table[el].values)/100.
        conc[el], d[el], y_c[el] = {}, {}, {}

        if meta['X'] == 'category':
            for i in cats:
                k = '{:06.2f}'.format(float(i))
                y_c[el][k] = to_numeric(y[where(x==i)])
                conc[el][k] = to_numeric(concentration[el][where(x==i)])
                d[el][k] = interp1d(y_c[el][k], conc[el][k])

            functions[el] = lambda a, b, el=el: d[el][a](b)

        else:
            functions[el] = interp2d(float(x), float(y), concentration[el])

    return functions
Example #15
def parse_mark_duplicate_metrics(fn):
    """
    Parse the output from Picard's MarkDuplicates and return as pandas
    Series.

    Parameters
    ----------
    fn : str
        Path to the Picard output file you want to parse.

    Returns
    -------
    metrics : pandas.Series
        Duplicate metrics.

    hist : pandas.Series
        Duplicate histogram.

    """
    with open(fn) as f:
        lines = [x.strip().split('\t') for x in f.readlines()]
    metrics = pd.Series(lines[7], lines[6])
    m = pd.to_numeric(metrics[metrics.index[1:]])
    metrics[m.index] = m.values

    vals = np.array(lines[11:-1])
    hist = pd.Series(vals[:, 1], index=[int(float(x)) for x in vals[:, 0]])
    hist = pd.to_numeric(hist)
    return metrics, hist
Example #16
def boxplotArray(data, pGroups=None, thr=None, ax=None):
  if ax is None:
    w,h = (6.4, 4.8)
    dpi = 100
    fig = plt.figure(figsize=(w,h))
    ax = fig.add_axes([70.0/w/dpi, 54.0/h/dpi, 1-2*70.0/w/dpi, 1-2*54.0/h/dpi])
  if pGroups is None:
    df = pd.DataFrame()
    df["x"] = pd.to_numeric(pd.Series(data[2:]))
    ax.boxplot(df["x"])
  else:
    bdata = []
    for k in range(len(pGroups)):
      df = pd.DataFrame()
      order = pGroups[k][2]
      val = [data[i] for i in order if data[i] != ""]
      df["x"] = pd.to_numeric(pd.Series(val))
      bdata.append(df["x"])      
    bp = ax.boxplot(bdata, patch_artist=True)
    ax.set_xticklabels([ g[0] for g in pGroups ])
    for k in range(len(pGroups)):
      bp['boxes'][k].set(color = pGroups[k][1])
      bp['boxes'][k].set(facecolor = pGroups[k][1], alpha=0.2)
      bp['fliers'][k].set(color = pGroups[k][1])
      bp['medians'][k].set(color = pGroups[k][1])
    for k in range(2*len(pGroups)):
      bp['whiskers'][k].set(color = pGroups[k//2][1])
      bp['caps'][k].set(color = pGroups[k//2][1])
  ax.set_ylabel(data[1])
  if thr is not None:
    ax.axhline(y=thr[1], color='r')
    ax.axhline(y=thr[3], color='cyan')
    ax.axhline(y=thr[4], color='cyan')
  return ax
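A hedged usage sketch for boxplotArray; the layout of data (identifier, axis label, then string-encoded values) and of each pGroups entry ([group name, colour, list of indices into data]) is inferred from how the function indexes its arguments, so treat it as an assumption.

import matplotlib.pyplot as plt
import pandas as pd

data = ['GENE1', 'expression', '1.2', '2.4', '0.8', '3.1', '2.0', '1.7']   # invented values
pGroups = [['ctrl', 'blue', [2, 3, 4]],   # [name, colour, indices into data]
           ['case', 'red',  [5, 6, 7]]]
ax = boxplotArray(data, pGroups=pGroups)
plt.show()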
Example #17
def readThreeColumnTruth(fn, suffix=""):
    df = pd.read_csv(fn, sep=' ', skiprows=1,
                     names=['Name', 'Gene{}'.format(suffix),
                            'TPM{}'.format(suffix)], engine='c')
    df.set_index("Name", inplace=True)
    pd.to_numeric(df["TPM{}".format(suffix)], errors='ignore')
    return df
def get_df_from_annovar_csv(df, chunk_ids):

    df = df.rename(columns={'1000g2015aug_all': 'ThousandGenomeAll'})
    df.Chr = df.Chr.replace(to_replace='chrM', value='chrMT')
    df['Start'] = pandas.to_numeric(df['Start'])
    df['End'] = pandas.to_numeric(df['End'])
    df["nci60"] = utilities.to_float(df, "nci60")
    df["ThousandGenomeAll"] = utilities.to_float(df, "ThousandGenomeAll")
    df["ESP6500si_ALL"] = utilities.to_float(df, "ESP6500si_ALL")
    df["tfbsConsSites"] = df["tfbsConsSites"].dropna().apply(utilities.cell_to_dict)

    utilities.split_string(df, "Func.knownGene")
    utilities.split_string(df, "ExonicFunc.knownGene")

    #df["targetScanS"] = df["targetScanS"].dropna().apply(utilities.cell_to_dict)

    df["genomicSuperDups"] = df["genomicSuperDups"].dropna().apply(utilities.cell_to_dict)
    df["cytoBand"] = df["cytoBand"].dropna().apply(utilities.split_cytoband)
    df["cytoBand"] = df["cytoBand"].dropna().apply(utilities.lists_to_dict)
    df['hgvs_key'] = pandas.Series(chunk_ids)

    my_sample_id = df["Otherinfo"].dropna().apply(genotype_calling.split_sample_ID)
    genotype_call = my_sample_id.apply(lambda x: x[-2::])
    dict_split = genotype_call.apply(genotype_calling.return_dict)
    df['Otherinfo'] = dict_split
    df = df.rename(columns={'Otherinfo': 'Genotype_Call'})


    #Clean up dataframe
    df = utilities.modify_df(df)
    df_final = df.where((pandas.notnull(df)), None)

    return df_final
Example #19
def plotBooleanSelect(obj, pHash=None):
  info = getHegemonDataset(obj[5]);
  thr = getHegemonThr(obj[5], obj[0], obj[2]);
  thash = {}
  for v in thr:
    thash[v[0]] = v
  datax = getHegemonPtr(obj[4], obj[8])
  datay = getHegemonPtr(obj[4], obj[9])
  thrx = thash[str(obj[0])]
  thry = thash[str(obj[2])]
  df = pd.DataFrame()
  if pHash is None:
      df["x"] = pd.to_numeric(pd.Series(datax[1][2:]))
      df["y"] = pd.to_numeric(pd.Series(datay[1][2:]))
  else:
      order = [i for i in range(2, len(datax[0])) if datax[0][i] in pHash]
      val = [datax[1][i] for i in order]
      df["x"] = pd.to_numeric(pd.Series(val))
      val = [datay[1][i] for i in order]
      df["y"] = pd.to_numeric(pd.Series(val))
  ax = df.plot.scatter(x='x', y='y')
  ax.set_xlabel(obj[6])
  ax.set_ylabel(obj[7])
  ax.set_title("{0} (n = {1})".format(info[1], info[2]))
  ax.axhline(y=thry[1], color='r')
  ax.axhline(y=thry[3], color='cyan')
  ax.axhline(y=thry[4], color='cyan')
  ax.axvline(x=thrx[1], color='r')
  ax.axvline(x=thrx[3], color='cyan')
  ax.axvline(x=thrx[4], color='cyan')
  return ax
Example #20
    def get_k_frame(cls, con, code):
        k_df = tushare.bar(code, con, adj='qfq')
        k_df_size = k_df.index.size
        if not k_df_size: return k_df

        k_df['high'] = pandas.to_numeric(k_df['high'])
        k_df['low'] = pandas.to_numeric(k_df['low'])
        k_df['open'] = pandas.to_numeric(k_df['open'])
        k_df['close'] = pandas.to_numeric(k_df['close'])
        k_df['date'] = k_df.index
        k_df['date'] = k_df['date'].apply(lambda x: datetime.datetime.strftime(x, "%Y-%m-%d"))

        k_df = k_df.drop(['vol', 'amount'], axis=1)
        k_df = k_df.reset_index(drop=True)
        k_df = k_df.sort_values(by='date').reset_index(drop=True)
        k_df['k_pos'] = k_df.index

        def _get_per_change(x):
            if x == 0:
                _pc = (k_df.at[0, "close"] - k_df.at[0, "open"]) / k_df.at[0, "open"] * 100
            else:
                _pc = (k_df.at[x, "close"] - k_df.at[x-1, "close"]) / k_df.at[x-1, "close"] * 100
            return float("%0.2f" % _pc)

        k_df["per_change"] = k_df.k_pos.apply(_get_per_change)

        return k_df
def arrange_datas(MODEL_NUMBER):
    MODEL_PATH = './ahmet_models/model' + str(MODEL_NUMBER) + '.h5'
    #import h5py # to fix loading model problem
    #f = h5py.File(MODEL_PATH, 'r+')
    #del f['optimizer_weights']
    #f.close()

    model = load_model(MODEL_PATH)

    print("Train is being prepared..")
    model_new = Model(inputs=model.input, outputs=model.layers[-5].output)
    train_xgb= model_new.predict((X),batch_size=240, verbose=1)
    #model.layers
    print('\n New train shape: '+str(train_xgb.shape))
    train_xgb=pd.DataFrame(train_xgb)
    df['inc_angle'] = pd.to_numeric(df['inc_angle'], errors='coerce')
    train_xgb['angle'] =df['inc_angle']
    train_xgb['angle'] =train_xgb['angle'].fillna(train_xgb['angle'].median())
    train_xgb=np.array(train_xgb)

    print("\nTest is being prepared..")
    test_xgb= model_new.predict((X_test), batch_size=240,verbose=1)
    print('\n New test shape: '+str(test_xgb.shape))
    test_xgb=np.array(test_xgb)
    test_xgb=pd.DataFrame(test_xgb)
    df_test['inc_angle'] = pd.to_numeric(df_test['inc_angle'], errors='coerce')
    test_xgb['angle'] =df_test['inc_angle']
    test_xgb=np.array(test_xgb)
    return train_xgb,test_xgb
Example #22
def preprocess_people(data):

    # TODO refactor this duplication
    data['people_id'] = data['people_id'].apply(lambda x: x.split('_')[1])
    data['people_id'] = pd.to_numeric(data['people_id']).astype(int)

    data.date = pd.to_datetime(data.date)
    #  Values in the people df are Booleans and Strings
    columns = list(data.columns)
    columns.remove("date")
    bools = columns[11:]
    strings = columns[1:11]

    for col in bools:
        data[col] = pd.to_numeric(data[col]).astype(int)
    for col in strings:
        data[col] = data[col].fillna('type 0')
        data[col] = data[col].apply(lambda x: x.split(' ')[1])
        data[col] = pd.to_numeric(data[col]).astype(int)

    # Add features from date
    data["year_p"] = data.date.apply(lambda x: x.year)
    data["month_p"] = data.date.apply(lambda x: x.month)
    data["day_p"] = data.date.apply(lambda x: x.day)

    data = data.drop(['date'], axis=1)
    return data
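A hedged usage sketch for preprocess_people; the function is written against the Kaggle Red Hat Business Value people.csv layout (people_id, date, ten "type N"/"group N" string columns, then boolean columns), so the path below is an assumption about where that file lives.

import pandas as pd

people = pd.read_csv('people.csv')      # assumed local copy of the Red Hat people file
people = preprocess_people(people)
print(people.head())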
Example #23
def validate_split_data(
        raw_data,
        split_data,
        split_obj,
        float_limit=float(TEST_CONFIG.get('TEST', 'float_limit'))
):
    """validate data that did not split

    Args:
        raw_data (:obj:`pandas.DataFrame`): raw data (A group)
        split_data (:obj:`pandas.DataFrame`): split data (B group)
        split_obj (:obj:`split_utils.SplitInfo`): split information
        float_limit (float): maximum deviation for equality test

    Raises:
        AssertionError: asserts expected shapes

    """
    for column in split_data.columns.values:
        #print(split_data[column])
        #print(raw_data[column])
        if column == 'date':
            assert split_data[column].equals(raw_data[column])
        elif column == 'index':
            continue
        elif column in split_utils.PRICE_KEYS:
            diff = abs(
                pd.to_numeric(split_data[column]) - pd.to_numeric(raw_data[column]) * split_obj
            )
            assert diff.max() < float_limit
        else:
            diff = abs(
                pd.to_numeric(split_data[column]) - pd.to_numeric(raw_data[column]) / split_obj
            )
            assert diff.max() < float_limit
Example #24
def parse_xml(token):
    """Attempt to parse the XML into something useful"""
    root = ET.fromstring(token)
    hml = HMLData()
    hml.station = root.attrib['id']
    hml.stationname = root.attrib.get('name')
    hml.originator = root.attrib.get('originator')
    hml.generationtime = parseUTC(root.attrib['generationtime'])
    for child in root:
        if child.tag not in ['observed', 'forecast']:
            continue
        rows = []
        for datum in child.findall("datum"):
            secondary = datum.find('secondary')
            rows.append(dict(name=child.tag,
                             valid=parseUTC(datum.find('valid').text),
                             primary=datum.find('primary').text,
                             secondary=(secondary.text
                                        if secondary is not None
                                        else None)))
        mydict = hml.data[child.tag]
        df = pd.DataFrame(rows)
        df['primary'] = pd.to_numeric(df['primary'], errors='coerce')
        df['secondary'] = pd.to_numeric(df['secondary'], errors='coerce')
        mydict['dataframe'] = df
        mydict['issued'] = parseUTC(child.attrib.get('issued'))
        for attr in ['primaryName', 'secondaryName',
                     'primaryUnits', 'secondaryUnits']:
            mydict[attr] = child.attrib.get(attr)
    return hml
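A hedged usage sketch for parse_xml; HMLData and parseUTC come from elsewhere in the original module, and the token below is only a schematic HML document invented to show the expected shape (a root with id/generationtime attributes and observed/forecast children holding datum elements).

token = """<site id="DMOI4" name="Des Moines" originator="KWOH"
      generationtime="2024-01-01T00:00:00Z">
  <observed issued="2024-01-01T00:00:00Z" primaryName="Stage" primaryUnits="ft">
    <datum><valid>2024-01-01T00:00:00Z</valid><primary>12.3</primary></datum>
  </observed>
</site>"""

hml = parse_xml(token)
print(hml.data['observed']['dataframe'])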
Example #25
def validate_plain_data(
        raw_data,
        split_data,
        float_limit=float(TEST_CONFIG.get('TEST', 'float_limit'))
):
    """validate data that did not split

    Args:
        raw_data (:obj:`pandas.DataFrame`): raw data (A group)
        split_data (:obj:`pandas.DataFrame`): split data (B group)
        float_limit (float): maximum deviation for equality test

    Returns:
        (None): asserts internally

    """
    for column in split_data.columns.values:
        print(split_data[column])
        print(raw_data[column])
        if column == 'date':
            assert split_data[column].equals(raw_data[column])
        elif column == 'index':
            continue
        else:
            diff = abs(pd.to_numeric(split_data[column]) - pd.to_numeric(raw_data[column]))
            assert diff.max() < float_limit
Example #26
def _readData(filepath):
    data = pandas.read_csv(filepath)
    data["Age"] = data["Age"].fillna(data["Age"].median())
    data.loc[data["Sex"] == 'male', "Sex"] = 0
    data.loc[data["Sex"] == 'female', "Sex"] = 1
    data["Embarked"] = data["Embarked"].fillna('S')
    data.loc[data["Embarked"] == 'S', "Embarked"] = 0
    data.loc[data["Embarked"] == 'C', "Embarked"] = 1
    data.loc[data["Embarked"] == 'Q', "Embarked"] = 2
    data["Fare"] = data["Fare"].fillna(data["Fare"].median())

    # new features
    data["FamilySize"] = data["SibSp"] + data["Parch"]
    data["NameLength"] = data["Name"].apply(lambda x: len(x))

    # _family id
    # people with same (last name + family size) = family member
    family_id_mapping = {}

    def _getFamilyId(row):
        lastName = row["Name"].split(",")[0]
        family_id = lastName + str(row["FamilySize"])
        if family_id not in family_id_mapping:
            if len(family_id_mapping) == 0:
                current_id = 1;
            else:
                current_id = (max(family_id_mapping.items(), key=operator.itemgetter(1))[1] + 1)
            family_id_mapping[family_id] = current_id
        return family_id_mapping[family_id]

    family_ids = data.apply(_getFamilyId, axis=1)
    family_ids[data["FamilySize"] < 3] = -1
    data["FamilyId"] = family_ids
    # family id_

    #_get family title
    def _getTitle(name):
        title = re.search(r" ([A-Za-z]+)\.", name)
        if title:
            return title.group(1)
        return ""
    titles = data["Name"].apply(_getTitle)
    title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6, "Major": 7, "Col": 7, "Mlle": 8, "Mme": 8, "Don": 9,
     "Lady": 10, "Countess": 10, "Jonkheer": 10, "Sir": 9, "Capt": 7, "Ms": 2}
    '''
    j=0
    for i in titles:
        title_mapping[i]=j
        j=j+1
    '''

    for k,v in title_mapping.items():
        titles[titles == k] = v
    data["Title"] = titles
    data["Title"]=data["Title"].apply(lambda x: pandas.to_numeric(x,errors='coerce'))
    data.loc[data["Title"].apply(math.isnan),"Title"] = -1
    # get family title_

    data = data.apply(lambda x: pandas.to_numeric(x, errors='ignore'))
    return data
Example #27
def get_project_timeline():
    d = load_table('SafeWaterProjectMonthlySummary')

    # Remove invalid dates
    d = d[d['MonthAndYear'] != '2004-10-28']
    d = d[d['MonthAndYear'] != '2018-07-28']

    # Adjust for apparent date typo
    d['MonthAndYear'] = d['MonthAndYear'].where(d['MonthAndYear'] != '2105-06-28', '2015-06-28')
    d['MonthAndYear'] = d['MonthAndYear'].apply(lambda x: pd.to_datetime(x[:7]))

    # Rename date column
    d = d.rename(columns={'MonthAndYear': 'Date'})

    # Remove "Additional*" fields
    d = d[[c for c in d if not c.startswith('Additional')]]

    # Assessment ID + date conflicts do occur, so choose arbitrarily from between them
    d = d.groupby(['AssessmentID', 'Date'], group_keys=False).apply(lambda x: x.head(1))

    d['AssessmentID'] = pd.to_numeric(d['AssessmentID'], errors='coerce')

    d = d[d['AssessmentID'].notnull()]
    d = d[d['Date'].notnull()]

    d = d.set_index(['Date', 'AssessmentID'])
    for c in d:
        d[c] = pd.to_numeric(d[c], errors='coerce')

    return d
Example #28
def preprocess_acts(data, train_set=True):

    # Getting rid of the activity_id feature for now
    data = data.drop(['activity_id'], axis=1)
    data.date = pd.to_datetime(data.date)

    if(train_set):
        data = data.drop(['outcome'], axis=1)

    ## Split off _ from people_id
    data['people_id'] = data['people_id'].apply(lambda x: x.split('_')[1])
    data['people_id'] = pd.to_numeric(data['people_id']).astype(int)

    columns = list(data.columns)
    columns.remove("date")

    # Convert strings to ints
    for col in columns[1:]:
        data[col] = data[col].fillna('type 0')
        data[col] = data[col].apply(lambda x: x.split(' ')[1])
        data[col] = pd.to_numeric(data[col]).astype(int)

    # Add features from date
    data["year"] = data.date.apply(lambda x: x.year)
    data["month"] = data.date.apply(lambda x: x.month)
    data["day"] = data.date.apply(lambda x: x.day)
    data = data.drop(["date"], axis = 1)
    return data
def convert_units_to_floats(path_base, out_path):
    files = util_path.get_files_by_name_ext(path_base, '.', 'csv')
    
    for f in files:
        #logging.debug("File {}".format(f))
        
        this_dir, basename = os.path.split(f)

        basename_noext, this_ext = os.path.splitext(basename)
        
        logging.info("Processing {}, a {} from {} ".format(basename_noext, this_ext, path_base))
        
        # Load as DF
        df = pd.read_csv(f, sep=";",encoding='utf-8')
        
        logging.debug("Loaded as DF: {}".format(df.shape))

        # Converting
        df["Volume"] = pd.to_numeric(df["Volume"].str.extract(r"(\d+\.\d*)", expand=False))
        df["Area"] = pd.to_numeric(df["Area"].str.extract(r"(\d+\.\d*)", expand=False))
        if 'Elevation' in df:
            df["Elevation"] = pd.to_numeric(df["Elevation"].str.extract(r"(\d+\.\d*)", expand=False))

        logging.debug("Converted Volume Area Elevation".format())
        
        
        #print(os.access(out_path, os.R_OK),os.access('foo.txt', os.W_OK))
        this_out_path = os.path.join(out_path, basename_noext+'.csv')
        logging.debug("Writing DF to {}".format(this_out_path))
        
        df.to_csv(this_out_path, sep = ';')
Example #30
def create_volume_time_graph(tank_type_var, file_path, file_path2 = None, date_from = None, date_to = None):
    """Creates a tank volumn in % vs time graph"""
    csv1, files_exist = open_files(file_path)

    if file_path2 != None:
        csv2, files_exist = open_files(file_path2)
        frames = [csv1, csv2]
        csv = pd.concat(frames)

    try:
        time = "Train end time [local_unit_time]" #get the time column
        csv = csv.sort(columns=time)
    except:
        time = "Report date"
        csv = csv.sort(columns=time)

    print files_exist

    if files_exist != True:
        return
    print "data analyzed"

    total_wheels = csv["Total wheels"]
    
    try:
        tank_level = csv["Product %"]
    except:
        tank_level = csv["Raw Level"]
    tank_level_mask = np.isfinite(tank_level)


    #find times when the control box is changed
    #counter_change = filtered_time.groupby(filtered_csv["Wheels TA"]).apply(lambda x: np.array(x))

    box_change, settings = get_setting_changes(csv, get_time(csv))
    #print box_change, settings, len(box_change), len(settings)
    #print total_wheels
    fig, ax1 = plt.subplots()
    ax2 = ax1.twinx()

    ax1.plot(get_time(csv)[tank_level_mask], tank_level[tank_level_mask], label = "tank level")
    
    if file_path2 != None:
        voltage1 = pd.to_numeric(csv1['Volts'])
        voltage2 = pd.to_numeric(csv2['Volts'])
        ax2.plot(get_time(csv1), voltage1, color='Red', label = "voltage1", alpha=0.5)
        ax2.plot(get_time(csv2), voltage2, color='Darkred', label = "voltage2", alpha=0.5)
    else:
        voltage = pd.to_numeric(csv['Volts'])
        ax2.plot(get_time(csv), voltage, color='Red', label = "voltage", alpha=0.5)
    [ax1.axvline(x=i, ls='--', lw=2.0) for i in box_change]
    [ax1.annotate('{0} x {1}'.format(s[0],s[1]), xy=(box_change[i], 15)) for i, s in enumerate(settings)]
    ax1.legend(loc=0)
    ax2.legend(loc=0)
    plt.title("Tank level and voltage over time")
    ax1.set_ylabel("Tank Level", color='Blue')
    ax2.set_ylabel("Voltage", color='Red')
    ax2.set_ylim(0,15)
    plt.xlabel("Time")
    plt.show()
Example #31
 def time_from_str(self, errors):
     to_numeric(self.str, errors=errors)
Example #32
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_csv('data/train.csv', index_col=0)

print(df.shape)
df.head()
df.shape

#df = df.fillna('0')
trainx = df.drop(['Churn', 'customerID'], axis=1)

trainy = df['Churn']

trainx['TotalCharges'] = pd.to_numeric(trainx['TotalCharges'], errors='coerce')
trainx = trainx.fillna('0')
trainx.shape

from scipy import stats
from sklearn import tree

clf = tree.DecisionTreeClassifier()
clf = clf.fit(trainx, trainy)
from sklearn import metrics


# Combine all relevant outputs into one function
def print_performance(y_true, y_pred):
    display(pd.DataFrame(metrics.confusion_matrix(y_true, y_pred)))
    print(metrics.classification_report(y_true, y_pred))
Example #33
#Read the JSON file directly from a URL without downloading it locally
#url = "https://opendata.ecdc.europa.eu/covid19/casedistribution/json/"
#json_url = urllib.request.urlopen(url)
#datos_json = json.loads(json_url.read())

#Filter the DataFrame to the selected country
datospais_df = datos[datos['countryterritoryCode'] == 'ESP']

#Convert the date from string to datetime format so we can filter by a date range
datospais_df['dateRep'] = pd.to_datetime(datospais_df['dateRep'],
                                         format='%d/%m/%Y')
#datospais_df['dateRep'] = pd.to_datetime(datospais_df['dateRep'], dayfirst = True)
print(datospais_df['dateRep'])

#Convert the case counts from string to numeric format so we can compute the moving averages
datospais_df['cases'] = pd.to_numeric(datospais_df['cases'])

#Sort the data by date in ascending order
DatosOrdenadosPorFecha_df = datospais_df.sort_values(by='dateRep',
                                                     ascending=True)

#Compute the moving average over the last 14 days
DatosOrdenadosPorFecha_df['moving14'] = DatosOrdenadosPorFecha_df[
    'cases'].transform(lambda x: x.rolling(14, 1).mean())
#Compute the moving average over the last 7 days
DatosOrdenadosPorFecha_df['moving7'] = DatosOrdenadosPorFecha_df[
    'cases'].transform(lambda x: x.rolling(7, 1).mean())

#Specify the date range we want to use
start_date = "2020-03-01"
end_date = "2020-08-13"
        def analysis(d):
            tar_li = ['AX', 'AY', 'AZ']
            ind = [
                'back_cut', 'back_drive', 'back_short', 'back_smash', 'fo_cut',
                'fo_drive', 'fo_short', 'fo_smash'
            ]
            print(d)
            tmp_str = d.split('|')
            mode = tmp_str[0]
            if mode in po_4:
                power = 4
            elif mode in po_2:
                power = 2
            elif mode in po_1:
                power = 1

            data = tmp_str[1].split('\n')
            #data.pop(0)
            index = data.pop(0)
            tmp_real_data = []

            for dat_num in range(len(data)):
                if data[dat_num] == '':
                    continue
                tmp_real_data.append(data[dat_num].split(','))

            df = pd.DataFrame(tmp_real_data)
            index_li = index.split(',')
            df.columns = index_li

            #now change str to float

            for y in index_li:
                df[y] = pd.to_numeric(df[y], downcast='float')

            tmp_li = []
            for i in range(len(df)):
                tmp = []
                #tmp.append((df['AX'][i]/1000)**power)
                #tmp.append((df['AY'][i]/1000)**power)
                #tmp.append((df['AZ'][i]/1000)**power)
                tmp.append((df['AX'][i]))
                tmp.append((df['AY'][i]))
                tmp.append((df['AZ'][i]))
                tmp_li.append(tmp)
            total_data = []
            total_data.append(tmp_li)
            total_data = np.array(total_data)

            if mode != 'lstm':
                nsamples, nx, ny = total_data.shape
                total_data = total_data.reshape((nsamples, nx * ny))

            if mode == 'lstm':
                with graph.as_default():
                    set_session(session)
                    res = lstm.predict(total_data)
                res = np.argmax(res, axis=-1)
            elif mode == 'svm':
                res = svm.predict(total_data)
                print(res)

            elif mode == 'rf':
                res = rf.predict(total_data)
                print(res)

            elif mode == 'nb':
                res = nb.predict(total_data)
                print(res)

            elif mode == 'xgboost':
                res = xgboost.predict(total_data)
                print(res)

            elif mode == 'knn':
                res = knn.predict(total_data)
                print(res)
            print(ind[res[0]])
            return ind[res[0]]
Example #35
for time_to_death_label in df['Time_To_Death']:
    ohe_label = [0, 0, 0, 0]
    # Find out which range the time belongs to by finding index of first truth
    time_class = [time_to_death_label < cutoff
                  for cutoff in cutoffs].index(True)
    Enc_labels.append(time_class)
    ohe_label[time_class] = 1.0
    ohe_labels.append(ohe_label)

# Add to dataframe
df['OHE_Time_To_Death'] = ohe_labels
df['Enc_Time_To_Death'] = Enc_labels

# Exclude all entries with "Missing" Died stats
df = df[~df['Died'].isin(['Missing'])]
df['Died'] = pd.to_numeric(df['Died'])

df.to_csv('/data/COVID/Labels/KCH_CXR_JPG_latest_dt.csv', index=False)

# Mobile vs Non-mobile
mobiles = df[df.Examination_Title == 'Chest - Xray (Mobile)']
non_mobiles = df[df.Examination_Title == 'Chest - X ray']

plt.figure(1)
plt.title('Mobiles')
plt.hist(mobiles.Enc_Time_To_Death)
plt.figure(2)
plt.title('Non-Mobiles')
plt.hist(non_mobiles.Enc_Time_To_Death)
plt.show()
Example #36
    return ga


g_smth = testGauss(t, V, fs)
plt.figure()
plt.plot(t[:-1], g_smth)
plt.xlabel('time(s)')
plt.ylabel('Velocities(Cm/s)')


def testButterworth(nyf, t, V, fs):
    b, a = butter(8, 6 / nyf, btype='low', analog=False)
    fl = filtfilt(b, a, V)

    #print (ssqe(fl, X, fs))
    return fl


test_butter = testButterworth(nyf, t, V, fs)
plt.figure()
plt.plot(t[:-1], test_butter)
plt.title('butterworth lowpass 6Hz')
plt.xlabel('time(s)')
plt.ylabel('Velocities(cm/s)')

plt.figure()
plt.plot(pd.to_numeric(t[:-1]), pd.to_numeric(V))
plt.title('Trajectories')
plt.xlabel('time(s)')
plt.ylabel('Velocities(Cm/s)')
plt.show()
Example #37
def convert_string_col_to_int(df, col):
    converted_df = pd.to_numeric(df[col]).astype('Int64')
    return converted_df
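A small sketch of the helper above (the values are made up); routing through pandas' nullable 'Int64' dtype keeps missing entries as <NA> instead of failing the integer cast.

import pandas as pd

df = pd.DataFrame({'votes': ['10', '3', None]})
print(convert_string_col_to_int(df, 'votes'))   # 10, 3, <NA> as a nullable Int64 Series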
Example #38
 def time_downcast(self, dtype, downcast):
     to_numeric(self.data, downcast=downcast)
### End : Collect All Data at one place ---------------------------------------


### Start : Clean up the Data -------------------------------------------------
#### Check NaN values present
nan_val = df_allmonthsdata[df_allmonthsdata.isna().any(axis=1)]
print("Check NaN values present : ",nan_val.head())
#### Drop rows of NAN
df_allmonthsdata = df_allmonthsdata.dropna(how="all")
invalid_data = df_allmonthsdata[df_allmonthsdata["Order Date"].str[0:2] == 'Or']
print("Invalid values present : ", invalid_data.head())
# Now Select Correct Data which does not have "Or" value in Date
df_allmonthsdata = df_allmonthsdata[df_allmonthsdata["Order Date"].str[0:2] != 'Or']

#### Convert Columns to appropriate data type
df_allmonthsdata["Quantity Ordered"] = pd.to_numeric(df_allmonthsdata["Quantity Ordered"]) # Make int
df_allmonthsdata["Price Each"] = pd.to_numeric(df_allmonthsdata["Price Each"]) # Make int
### End : Clean up the Data -------------------------------------------------


### Start : find Order id which is present more than once, i.e. more than one product under same order----------------------------
needed_data = df_allmonthsdata[df_allmonthsdata["Order ID"].duplicated(keep=False)]
print("Retrieve & Collect only Duplicate : ",needed_data.head())
needed_data["list_of_products"] = needed_data.groupby("Order ID")["Product"].transform(lambda x: ','.join(x))
# Meaning of above line is, make a group of Products separated by comma and store in new Column, on the basis of similar Order IDs
print("Check list_of_products Column & All Data : ",needed_data.head())
# Above line shows Same order id more than once which is correct but we want all duplicates lines should be combined together
needed_data = needed_data[['Order ID','list_of_products']].drop_duplicates()
# Above will create DataFrame of two columns 'Order ID','list_of_products' with combined manner.
print("Show only one record, removed duplicates : ",needed_data.head())
sort_data = needed_data.sort_values(by=["list_of_products"])
Example #40
 def time_from_float(self, errors):
     to_numeric(self.float, errors=errors)
Example #41
def num_coerce(value):
    if ' ' in value:
        value = value.split(' ')[0]
    elif value == '----':
        value = 0
    return pd.to_numeric(value)
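A short sketch of num_coerce applied element-wise to a scraped column; the sample values are invented to hit each branch (trailing unit, the '----' placeholder, and a plain number).

import pandas as pd

raw = pd.Series(['12 oz', '----', '7'])
print(raw.apply(num_coerce))   # -> 12, 0, 7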
Example #42
    def train(self,
              data,
              categorical_columns=None,
              ordinal_columns=None,
              update_epsilon=None,
              verbose=False,
              mlflow=False):
        if update_epsilon:
            self.epsilon = update_epsilon

        if isinstance(data, pd.DataFrame):
            for col in data.columns:
                data[col] = pd.to_numeric(data[col], errors='ignore')
            self.pd_cols = data.columns
            self.pd_index = data.index
            data = data.to_numpy()
        elif not isinstance(data, np.ndarray):
            raise ValueError("Data must be a numpy array or pandas dataframe")

        dataset = TensorDataset(
            torch.from_numpy(data.astype('float32')).to(self.device))
        dataloader = DataLoader(dataset,
                                batch_size=self.batch_size,
                                shuffle=True,
                                drop_last=True)

        if not hasattr(self, "generator"):
            self.generator = Generator(self.latent_dim,
                                       data.shape[1],
                                       binary=self.binary).to(self.device)
        if not hasattr(self, "discriminator"):
            self.discriminator = Discriminator(data.shape[1]).to(self.device)

        self.optimizer_d = optim.Adam(self.discriminator.parameters(),
                                      lr=4e-4,
                                      betas=(0.5, 0.9))
        if hasattr(self, "state_dict"):
            self.optimizer_d.load_state_dict(self.state_dict)

        if not hasattr(self, "privacy_engine"):
            privacy_engine = PrivacyEngine(
                self.discriminator,
                batch_size=self.batch_size,
                sample_size=len(data),
                alphas=[1 + x / 10.0
                        for x in range(1, 100)] + list(range(12, 64)),
                noise_multiplier=3.5,
                max_grad_norm=1.0,
                clip_per_layer=True).to(self.device)
        else:
            privacy_engine = self.privacy_engine

        privacy_engine.attach(self.optimizer_d)

        if hasattr(self, "privacy_engine"):
            epsilon, best_alpha = self.optimizer_d.privacy_engine.get_privacy_spent(
                self.delta)
        else:
            epsilon = 0

        if not hasattr(self, "optimizer_g"):
            self.optimizer_g = optim.Adam(self.generator.parameters(), lr=1e-4)

        criterion = nn.BCELoss()

        for epoch in range(self.epochs):

            if self.epsilon < epsilon:
                break

            for i, data in enumerate(dataloader):
                self.discriminator.zero_grad()

                real_data = data[0].to(self.device)

                # train with fake data
                noise = torch.randn(self.batch_size,
                                    self.latent_dim,
                                    1,
                                    1,
                                    device=self.device)
                noise = noise.view(-1, self.latent_dim)
                fake_data = self.generator(noise)
                label_fake = torch.full((self.batch_size, 1),
                                        0,
                                        dtype=torch.float,
                                        device=self.device)
                output = self.discriminator(fake_data.detach())
                loss_d_fake = criterion(output, label_fake)
                loss_d_fake.backward()
                self.optimizer_d.step()

                # train with real data
                label_true = torch.full((self.batch_size, 1),
                                        1,
                                        dtype=torch.float,
                                        device=self.device)
                output = self.discriminator(real_data.float())
                loss_d_real = criterion(output, label_true)
                loss_d_real.backward()
                self.optimizer_d.step()

                loss_d = loss_d_real + loss_d_fake

                max_grad_norm = []
                for p in self.discriminator.parameters():
                    param_norm = p.grad.data.norm(2).item()
                    max_grad_norm.append(param_norm)

                privacy_engine.max_grad_norm = max_grad_norm

                # train generator
                self.generator.zero_grad()
                label_g = torch.full((self.batch_size, 1),
                                     1,
                                     dtype=torch.float,
                                     device=self.device)
                output_g = self.discriminator(fake_data)
                loss_g = criterion(output_g, label_g)
                loss_g.backward()
                self.optimizer_g.step()

                # manually clear gradients
                for p in self.discriminator.parameters():
                    if hasattr(p, "grad_sample"):
                        del p.grad_sample
                # autograd_grad_sample.clear_backprops(discriminator)

                if self.delta is None:
                    self.delta = 1 / data.shape[0]

                eps, best_alpha = self.optimizer_d.privacy_engine.get_privacy_spent(
                    self.delta)
                self.alpha = best_alpha

            if (verbose):
                print('eps: {:f} \t alpha: {:f} \t G: {:f} \t D: {:f}'.format(
                    eps, best_alpha,
                    loss_g.detach().cpu(),
                    loss_d.detach().cpu()))

            if (mlflow):
                import mlflow
                mlflow.log_metric("loss_g",
                                  float(loss_g.detach().cpu()),
                                  step=epoch)
                mlflow.log_metric("loss_d",
                                  float(loss_d.detach().cpu()),
                                  step=epoch)
                mlflow.log_metric("epsilon", float(eps), step=epoch)

            if self.epsilon < eps:
                break

        privacy_engine.detach()
        self.state_dict = self.optimizer_d.state_dict()
        self.privacy_engine = privacy_engine
# TODO: Load up the table, and extract the dataset
# out of it. If you're having issues with this, look
# carefully at the sample code provided in the reading
#
# .. your code here ..
#df = pd.read_html('http://espn.go.com/nhl/statistics/player/_/stat/points/sort/points/year/2015/seasontype/2')
htmlstr = 'http://espn.go.com/nhl/statistics/player/_/stat/points/sort/points/year/2015/seasontype/2'
df = pd.read_html(htmlstr)[0]
columns = df.iloc[1, :]
df.columns = columns
col_num = len(columns)
df_1 = df.dropna(thresh=col_num - 4)  # drop rows that have at least 4 NaNs
df_1 = df_1[df_1.RK != 'RK']  # drop the repeated header rows
#df_1 = df_1[(df.RK != 'RK')]
#df_1 = df_1.iloc[:,1:]
df_1.iloc[:, 1] = pd.to_numeric(df.iloc[:, 1], errors='coerce')
print(df_1.describe())
#print(df_1.loc[15:16,'GP'])

#df_not = df[(df.PLAYER != 'PP') & (df.TEAM != 'SH')]

# TODO: Rename the columns so that they match the
# column definitions provided to you on the website
#
# .. your code here ..

# TODO: Get rid of any row that has at least 4 NANs in it
#
# .. your code here ..

# TODO: At this point, look through your dataset by printing
Example #44
column_names = ['id_record', 'id_car', 'status', 'lat', 'lon', 'time']
use_columns = ['id_car', 'status', 'lat', 'lon', 'time']
# data_types = [str,int,str,float,float,str]
arr = np.empty(shape=(len(os.listdir('/home/martin/MOBILITY/data/traces')), ),
               dtype=object)
for idx, filename in tqdm(
        enumerate(os.listdir('/home/martin/MOBILITY/data/traces'))):
    abs_filename = os.path.join('/home/martin/MOBILITY/data/traces', filename)
    filename_parts = abs_filename.split(sep='.')
    file_extension = filename_parts[-2]
    df = pd.read_csv(abs_filename,
                     header=None,
                     names=column_names,
                     usecols=['id_car', 'status', 'lat', 'lon', 'time'],
                     converters={'status': f})
    df['id_car'] = pd.to_numeric(file_extension + df['id_car'].astype(str))
    # print(df.head())
    # print(df.dtypes)
    # q = sort_one_by_one(df,'id_car','time')
    # print(q.head())
    filtered = df[df.loc[:, 'status']].loc[:, ['id_car', 'lat', 'lon', 'time']]
    filtered.sort_values(by='time', ascending=True, inplace=True)
    filtered.sort_values(by='id_car',
                         kind='mergesort',
                         ascending=True,
                         inplace=True)
    # print(filtered)
    # ls.append(filtered)
    arr[idx] = filtered

df = pd.concat(arr, ignore_index=True)
Example #45
df.describe()

# We have a number of demographics for each individual as well as the products they currently own. To make a test set, I will separate the last month from this training data, and create a feature that indicates whether or not a product was newly purchased. First convert the dates. There's `fecha_dato`, the row-identifier date, and `fecha_alta`, the date that the customer joined.

# In[ ]:

df["fecha_dato"] = pd.to_datetime(df["fecha_dato"], format="%Y-%m-%d")
df["fecha_alta"] = pd.to_datetime(df["fecha_alta"], format="%Y-%m-%d")
df["fecha_dato"].unique()

# I printed the values just to double check the dates were in standard Year-Month-Day format. I expect that customers will be more likely to buy products at certain months of the year (Christmas bonuses?), so let's add a month column. I don't think the month that they joined matters, so just do it for one.

# In[ ]:

df["month"] = pd.DatetimeIndex(df["fecha_dato"]).month
df["age"] = pd.to_numeric(df["age"], errors="coerce")

# Are there any columns missing values?

# In[ ]:

df.isnull().any()

# Definitely. Onto data cleaning.
#
# ## Data Cleaning
#
# Going down the list, start with `age`

# In[ ]:
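# The original cleaning cell is cut off in this excerpt; as a minimal sketch (the
# 18-100 bounds are an arbitrary assumption), implausible ages could be treated
# as missing and filled with the median:
df["age"] = df["age"].where(df["age"].between(18, 100))
df["age"] = df["age"].fillna(df["age"].median()).astype("int64")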
Example #46
0
def base_load_sessions(
    item_df,
    csv_file,
    secondary_csv_file,
    output_file,
    shared_output_file,
    label_encoders,
    hot_encoders,
    nrows,
):
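    """Load (or build and cache) the preprocessed session data.

    If the pickled results already exist they are loaded and merged; otherwise
    the raw CSV file(s) are read, categorical columns are label/one-hot
    encoded, invalid references and sessions whose last entry is not a
    clickout are filtered out, and the result is pickled. Returns a dict with
    the session frame, relevant session ids, item properties, the groupby
    object, and the encoders.
    """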
    from train_recommender import RAW_DATA_PATH, DATA_PATH

    pickle_path = os.path.join(DATA_PATH, output_file)
    shared_pickle_path = os.path.join(DATA_PATH, shared_output_file)

    is_test = secondary_csv_file is None

    if os.path.exists(pickle_path):
        result = pickle.load(open(pickle_path, "rb"))
        result.update(pickle.load(open(shared_pickle_path, "rb")))
        return result
    else:
        data_path = os.path.join(RAW_DATA_PATH, csv_file)
        print("load csv")
        raw_df = pd.read_csv(data_path, sep=',', nrows=nrows)

        if secondary_csv_file is not None:
            secondary_df = pd.read_csv(os.path.join(RAW_DATA_PATH,
                                                    secondary_csv_file),
                                       sep=',',
                                       nrows=nrows)
            raw_df = pd.concat([raw_df, secondary_df], ignore_index=True)
        raw_df = split_row(raw_df, column='city', sep=',')

        # extract search_for poi impression into own column
        prepare_action_types = [
            'search for poi',
            'change of sort order',
            'filter selection',
            'search for destination',
        ]
        raw_df = prepare_reference(raw_df, prepare_action_types)

        # encode labels
        hot_encoders, label_encoders = hot_encode_labels(
            raw_df,
            columns=[
                'session_id',
                'action_type',
                'city_0',
                'city_1',
                'platform',
                'device',
            ],
            label_encoders=label_encoders,
            hot_encoders=hot_encoders)
        print("Remove invalid references...")
        raw_df['reference'] = pd.to_numeric(
            raw_df['reference'], errors='coerce').fillna(-1).astype(int)

        clickout_type = label_encoders['action_type'].transform(
            ['clickout item'])[0]
        print("filter references which do not exist")

        referencing_action_type = label_encoders['action_type'].transform(
            get_referencing_action_types(is_test))
        item_properties = item_df.loc[raw_df['reference']]
        item_properties.reset_index(inplace=True, drop=True)
        raw_df = raw_df[~(
            (item_properties[0].isnull()) &
            (raw_df['action_type'].isin(referencing_action_type)))]
        raw_df.reset_index(inplace=True)

        print("filter session_ids where the last entry is not a clickout")
        next_session_id = raw_df["session_id"].shift(-1)
        to_delete = raw_df[(raw_df['session_id'] != next_session_id) & (
            raw_df['action_type'] != clickout_type)]['session_id']
        raw_df = raw_df[~raw_df['session_id'].isin(to_delete)]
        raw_df.reset_index(inplace=True, drop=True)

        print("prepare reference item_ids")
        item_properties = item_df.loc[raw_df['reference']]
        item_properties.reset_index(inplace=True, drop=True)
        item_properties.fillna(0.0, inplace=True)

        raw_df.drop([
            'index',
        ], axis=1, inplace=True)

        print("groupby")
        grouped = raw_df.groupby(by='session_id')

        print("shuffle & extract session ids")
        session_sizes = grouped[['step']].count()
        session_sizes = session_sizes[session_sizes['step'] > 1]
        noise = np.random.normal(0, 2, [len(session_sizes), 1]).astype(
            int)  # locally shuffle by sorting by "noised length"
        noised_session_sizes = session_sizes + noise
        noised_session_sizes.sort_values(by='step', inplace=True)
        train_session_ids = np.array(list(noised_session_sizes.index))

        print("write to disk")

        result = {
            "session": raw_df,
            "relevant_session_ids": train_session_ids,
            "item_properties": item_properties,
            "grouped": grouped,
        }
        shared_result = {
            "hot_encoders": hot_encoders,
            "label_encoders": label_encoders,
        }

        pickle.dump(result, open(pickle_path, "wb"), protocol=4)
        pickle.dump(shared_result, open(shared_pickle_path, "wb"), protocol=4)
        result.update(shared_result)

    return result
							elif temp[i-j,0] == 'methylated' and temp[i-j,6] == False:
								paired.append(temp[i])
								paired.append(temp[i-j])
								temp[i,6] = True
								temp[i-j,6] = True



	######
	#
	#	Convert paired sites into Dataframe
	paired_all = pd.DataFrame(paired, columns = ['CGstatus','methylation','motif','start','strand','type','paired'])
	paired_all.drop(['paired'],axis = 1, inplace = True)
	# make sign variable for downstream analysis
	paired_all['sign'] = paired_all['strand']+'1'
	paired_all['sign'] = pd.to_numeric(paired_all['sign'])

	# save output file
	paired_all.to_csv(output_path+'/ara_paired_10reads_%s_%s_k%s_%s.txt' %(mask_str, context, k, chromosome), sep = '\t', header = True)











Example #48
0
def calculator_road(city_code):
    roads_city = pd.read_csv("全國路名/" + city_code + "_road.csv")
    df = pd.read_csv("concate_csvs/" + city_code + ".csv")
    df = df.dropna(subset=[
        '鄉鎮市區', "土地區段位置建物區段門牌", '交易年月日', '建物移轉總面積平方公尺', '交易筆棟數', '單價元平方公尺'
    ])

    roads = []
    for i in range(len(roads_city)):
        site_id = roads_city["site_id"][i]
        road = roads_city["road"][i]
        roads.append(site_id + road)

    # select only the records from the last 5 years
    from datetime import datetime
    y = str(int('{:%Y}'.format(datetime.today())) - 1911)  # ROC calendar year (Gregorian year - 1911)
    month_day = '{:%m%d}'.format(datetime.today())
    today_date_string = y + month_day
    five_year_ago_date = str(int(today_date_string) - 50000)  # dates are yyyMMdd strings, so -50000 steps back 5 years
    filter_5年內資料 = pd.to_numeric(
        df["交易年月日"]) > (int(today_date_string) - 50000)
    df_for_analysis = df[filter_5年內資料]

    sd = pd.DataFrame(columns=["road", "每平方公尺標準差",
                               "每平方公尺年成長率"])  # per-road standard deviation and annual growth rate of price per square meter

    count = 1
    for location in roads:
        fliter_location = df_for_analysis["土地區段位置建物區段門牌"].str.contains(
            location)
        print(location)
        print(count / len(roads))
        count += 1
        if any(fliter_location):  # only operate on the DataFrame when a matching road segment is found
            df_location = df_for_analysis[fliter_location]
            df_location["單價元平方公尺"] = pd.to_numeric(df_location["單價元平方公尺"])
            df_location["建物移轉總面積平方公尺"] = pd.to_numeric(
                df_location["建物移轉總面積平方公尺"])
            df_location["土地數"] = df_location["交易筆棟數"].str.get(2)
            df_location["土地數"] = pd.to_numeric(df_location["土地數"])
            df_location["平均土地面積of一筆交易"] = df_location[
                "建物移轉總面積平方公尺"] // df_location["土地數"]
            # compute the standard deviation
            sd_of_location = df_location["單價元平方公尺"].std()

            # compute the annual growth rate
            mean_value_of_each_year = []
            for i in range(5):
                fliter_certain_year = (
                    pd.to_numeric(df_location['交易年月日']) >=
                    int(five_year_ago_date) + i * 10000) & (
                        pd.to_numeric(df_location['交易年月日']) <
                        int(five_year_ago_date) + 10000 + i * 10000)
                mean_value_of_each_year.append(
                    df_location[fliter_certain_year].mean()[1])
            gross_rate_of_each_year = []
            for i in range(1, 5):
                gross_rate_of_each_year.append(
                    (mean_value_of_each_year[i] -
                     mean_value_of_each_year[i - 1]) /
                    mean_value_of_each_year[i])
            annual_gross_rate = sum(gross_rate_of_each_year) / len(
                gross_rate_of_each_year)

            mean_value_of_location = df_location["單價元平方公尺"].mean()
            mean_area_of_location = df_location["平均土地面積of一筆交易"].mean()
            tempt = pd.DataFrame(
                {
                    "road": location,
                    "每平方公尺標準差": sd_of_location,
                    "每平方公尺年成長率": annual_gross_rate,
                    "meanValue": mean_value_of_location,
                    "mean_area": mean_area_of_location
                },
                index=[1])

            sd = pd.concat([sd, tempt], ignore_index=True)  # DataFrame.append is removed in recent pandas
        '''

def get_result():
    result = pd.DataFrame()
    for data in get_datas():
        result = result.append(data)
    return result


result = get_result()
print('Columns with missing data:', result.isnull().any().sum())
result.columns = ['日期', '最高温度', '最低温度', '天气状况', '风向']  # date, high temp, low temp, weather, wind direction
result.head(10)

result['日期'] = pd.to_datetime(result['日期'])
result['最高温度'] = pd.to_numeric(result['最高温度'])
result['最低温度'] = pd.to_numeric(result['最低温度'])
result['平均温度'] = (result['最高温度'] + result['最低温度']) / 2

#result.info()

sns.distplot(result['平均温度'])

# In[102]:

sns.countplot(result['天气状况'])

# In[110]:

# flag each day as precipitation ('降水') or no precipitation ('未降水') based on the weather description
result['是否降水'] = result['天气状况'].apply(
    lambda x: '未降水' if x in ['晴', '多云', '阴', '雾', '浮尘', '霾', '扬沙'] else '降水')
Example #50
0
def usbonds_command():
    """US bonds overview [Wall St. Journal]"""

    # Debug user input
    if imps.DEBUG:
        logger.debug("econ-usbonds")

    # Retrieve data
    df = wsj_model.us_bonds()

    # Check for argument
    if df.empty:
        raise Exception("No available data found")

    df["Rate (%)"] = pd.to_numeric(df["Rate (%)"].astype(float))
    df["Yld (%)"] = pd.to_numeric(df["Yld (%)"].astype(float))
    df["Yld Chg (%)"] = pd.to_numeric(df["Yld Chg (%)"].astype(float))

    formats = {
        "Rate (%)": "{:.2f}%",
        "Yld (%)": "{:.2f}%",
        "Yld Chg (%)": "<b>{:.2f}%</b>",
    }
    for col, value in formats.items():
        df[col] = df[col].map(lambda x: value.format(x))  # pylint: disable=W0640

    df = df.fillna("")
    df.set_index(" ", inplace=True)

    df = df.set_axis(
        [
            "Rate",
            "Yld",
            "Yld Chg",
        ],
        axis="columns",
    )

    font_color = ["white"] * 3 + [[
        "#e4003a" if boolv else "#00ACFF"
        for boolv in df["Yld Chg"].str.contains("-")
    ]]

    fig = imps.plot_df(
        df,
        fig_size=(550, (40 + (40 * len(df.index)))),
        col_width=[4, 2, 2, 2.1],
        tbl_header=imps.PLT_TBL_HEADER,
        tbl_cells=imps.PLT_TBL_CELLS,
        font=imps.PLT_TBL_FONT,
        row_fill_color=imps.PLT_TBL_ROW_COLORS,
        paper_bgcolor="rgba(0, 0, 0, 0)",
    )
    fig.update_traces(cells=(dict(
        align=["center", "right"],
        font=dict(color=font_color),
    )))

    imagefile = imps.save_image("econ-usbonds.png", fig)
    return {
        "title": "Economy: [WSJ] US Bonds",
        "imagefile": imagefile,
    }
Example #51
0
@author: T
"""

import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix  # pandas.tools.plotting was removed; the module is now pandas.plotting
import matplotlib.pyplot as plt

url = "http://archive.ics.uci.edu/ml/machine-learning-databases/mammographic-masses/mammographic_masses.data"
mam = pd.read_csv(url, header=None)

mam.columns = ["BI-RADS", "Age", "Shape", "Margin", "Density", "Severity"]

mam.dtypes

mam.loc[:, "BI-RADS"] = pd.to_numeric(mam.loc[:, "BI-RADS"], errors='coerce')

hasnan = np.isnan(mam.loc[:, "BI-RADS"])

print(hasnan)

mam.loc[hasnan, "BI-RADS"] = np.nanmedian(mam.loc[:, "BI-RADS"])  # np.median would return NaN here because of the missing values

plt.hist(mam.loc[:, "BI-RADS"])

toohigh = mam.loc[:, "BI-RADS"] > 6

mam.loc[toohigh, "BI-RADS"] = 6

import pandas as pd
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
plt.title(
    "Distribution of the Classes based on Clump Thickness and Epithelial Cell Size"
)
plt.show()

# ## Data Preprocessing and Selection

# In[6]:

cancer_data.dtypes

# It looks like the __BareNuc__ column includes some values that are not numerical. We can drop those rows:

# In[4]:

cancer_data = cancer_data[pd.to_numeric(cancer_data['BareNuc'],
                                        errors='coerce').notnull()]
cancer_data['BareNuc'] = cancer_data['BareNuc'].astype('int')
cancer_data.dtypes

# In[5]:

feature_df = cancer_data[[
    'Clump', 'UnifSize', 'UnifShape', 'MargAdh', 'SingEpiSize', 'BareNuc',
    'BlandChrom', 'NormNucl', 'Mit'
]]
X = np.asarray(feature_df)
X[0:5]

# In[6]:

cancer_data['Class'] = cancer_data['Class'].astype('int')
Example #53
0
    if not nexTour.empty:
        nt1 = pd.DataFrame(nexTour['stats'].tolist())
        nt1['id'] = nexTour['id']
        nt1['gameweek'] = i
        nt1.index = nt1['gameweek']*1000+nt1['id']
        Gameweeks = pd.concat([Gameweeks, nt1])  # DataFrame.append is removed in recent pandas
        print(i)


teams = dict(zip(pd.DataFrame(d1['teams'])['id'],pd.DataFrame(d1['teams'])['name']))
players = dict(zip(bigTable['id'],bigTable['full_name']))
teamplayers = dict(zip(bigTable['id'],bigTable['team']))

Gameweeks['team'] = [teamplayers[i] for i in Gameweeks['id']]

Gameweeks['threat'] = pd.to_numeric(Gameweeks['threat'])
Gameweeks['creativity'] = pd.to_numeric(Gameweeks['creativity'])

Gameweeks['team_a'] = [int(Fixtures[(Fixtures['event'] == Gameweeks.iloc[i, 20]) &
                                    ((Fixtures['team_a'] == Gameweeks.iloc[i, 21]) |
                                     (Fixtures['team_h'] == Gameweeks.iloc[i, 21]))]['team_a'])
                       for i in range(len(Gameweeks))]
Gameweeks['team_h'] = [int(Fixtures[(Fixtures['event'] == Gameweeks.iloc[i, 20]) &
                                    ((Fixtures['team_a'] == Gameweeks.iloc[i, 21]) |
                                     (Fixtures['team_h'] == Gameweeks.iloc[i, 21]))]['team_h'])
                       for i in range(len(Gameweeks))]

Gameweeks['teamAgainst'] = [Gameweeks.at[i, 'team_a'] if Gameweeks.at[i, 'team'] == Gameweeks.at[i, 'team_h']
                            else Gameweeks.at[i, 'team_h']
                            for i in Gameweeks.index]
Gameweeks['side'] = ['home' if Gameweeks.at[i, 'team'] == Gameweeks.at[i, 'team_h']
                     else 'away'
                     for i in Gameweeks.index]

del Gameweeks['team_a']
del Gameweeks['team_h']


Gameweeks.to_csv(Path('in/fplgameweeks.csv'))
Gameweeks
Example #54
0
def main():
    sf_gdf = gpd.read_file("san-francisco.geojson")
    sf_gdf['pop2010'] = pd.to_numeric(sf_gdf['pop2010'], downcast='integer')

    sf_map = folium.Map([37.7556, -122.4399], zoom_start=13)

    folium.GeoJson('san-francisco.geojson', name='geojson').add_to(sf_map)

    folium.GeoJson(data='san-francisco.geojson',
                   name='geojson',
                   style_function=style).add_to(sf_map)

    # Add labels
    manual_label = {5, 8, 9, 12, 15, 26, 27}

    for index, row in sf_gdf.iterrows():
        if index not in manual_label:
            folium.CircleMarker(get_centroid(row),
                                radius=POINT_RADIUS,
                                color='black',
                                fill=True,
                                fill_opacity=1).add_to(sf_map)
            add_label(sf_map, get_centroid(row), row['zip_code'])

    # 94104
    row = sf_gdf.iloc[12]
    add_label(sf_map, (37.794, -122.363705),
              row['zip_code'],
              icon_anchor=(0, 13))
    centroid = get_centroid(row)
    folium.CircleMarker(centroid,
                        radius=POINT_RADIUS,
                        color='black',
                        fill=True,
                        fill_opacity=1).add_to(sf_map)
    folium.PolyLine(locations=[centroid, (37.794, centroid[1])],
                    color='black',
                    weight=LINE_WEIGHT).add_to(sf_map)
    folium.PolyLine(locations=[(37.794, centroid[1]), (37.794, -122.363705)],
                    color='black',
                    weight=LINE_WEIGHT).add_to(sf_map)

    # 94108
    row = sf_gdf.iloc[15]
    add_label(sf_map, (37.797, -122.363705),
              row['zip_code'],
              icon_anchor=(0, 25))
    centroid = get_centroid(row)
    folium.CircleMarker(centroid,
                        radius=POINT_RADIUS,
                        color='black',
                        fill=True,
                        fill_opacity=1).add_to(sf_map)
    folium.PolyLine(locations=[centroid, (37.797, centroid[1])],
                    color='black',
                    weight=LINE_WEIGHT).add_to(sf_map)
    folium.PolyLine(locations=[(37.797, centroid[1]), (37.797, -122.363705)],
                    color='black',
                    weight=LINE_WEIGHT).add_to(sf_map)

    sf_map.save('index.html')
    'PL': 14901,
    'PT': 23408,
    'RO': 12301,
    'RU': 11162,
    'SA': 22865,
    'SE': 54608,
    'TH': 7274,
    'TR': 9370,
    'TW': 24827,
    'UA': 3592,
    'US': 65111,
    'VN': 2740,
    'ZA': 11300,
    'CO': 6500
})
df.gdpCountry = pd.to_numeric(df.gdpCountry, errors='coerce')
df['gdpCountry'] = df['gdpCountry'].fillna(11335)
df['gdpCountry'] = pd.cut(df.gdpCountry,
                          bins=[0, 29960, 50000, 150000],
                          labels=[0, 1, 2])

#End Yev Gdp

device_map = {
    'IPhone7': 0,
    'IPhone7Plus': 0,
    'IPhone8Plus': 0,
    'IPhone6S': 0,
    'IPhoneSE': 0,
    'IPhone8': 0,
    'IPhone6SPlus': 0,
Example #56
0
def main(task):
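    # Driver for the knapsack experiments; 'task' selects the analysis:
    # 'tune_problem' sweeps max_weight_pct with simulated annealing,
    # 'tuning_plots' tunes RHC restarts, SA temperature, and GA/MIMIC population
    # sizes, 'complexity_graph' compares all four algorithms across problem
    # sizes, and 'performance_graph' plots fitness vs. iteration, time, and
    # function evaluations at a fixed problem size.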
    if 'tune_problem' in task:
        # Tune Knapsack problem
        problem_size = 50
        weights = [idx for idx in range(1, problem_size + 1)]
        values = [idx for idx in range(1, problem_size + 1)]
        max_weight_pct_list = np.arange(0.1, 1, 0.05)
        knapsack_tuning_fitness = []
        knapsack_tuning_time = []
        knapsack_tuning_fevals = []
        for max_weight_pct in max_weight_pct_list:
            fitness = mlrose.Knapsack(weights, values, max_weight_pct)
            problem = mlrose.DiscreteOpt(problem_size,
                                         fitness,
                                         maximize=True,
                                         max_val=2)
            experiment_name = 'knapsack_tuning_weight_pct_' + str(
                max_weight_pct)
            temperature_list = np.arange(1, 50, 1)
            knapsack = runners.SARunner(problem=problem,
                                        experiment_name=experiment_name,
                                        output_directory='knapsack',
                                        seed=27,
                                        iteration_list=[5000],
                                        max_attempts=50,
                                        temperature_list=temperature_list)
            # the two data frames will contain the results
            knapsack_run_stats, knapsack_run_curves = knapsack.run()
            knapsack_tuning_fitness.append(knapsack_run_curves.loc[
                knapsack_run_curves['Fitness'].idxmax()]['Fitness'])
            knapsack_tuning_time.append(knapsack_run_curves.loc[
                knapsack_run_curves['Time'].idxmax()]['Time'])
            knapsack_tuning_fevals.append(2 * knapsack_run_curves.loc[
                knapsack_run_curves['Iteration'].idxmax()]['Iteration'])

        plt.rc("font", size=8)
        plt.rc("axes", titlesize=12)
        plt.rc("axes", labelsize=10)
        plt.rc("xtick", labelsize=8)
        plt.rc("ytick", labelsize=8)
        plt.rc("legend", fontsize=11)
        plt.rc("figure", titlesize=11)
        fig, ax = plt.subplots(1, 3, figsize=(10, 3.5))
        fig.suptitle('Knapsack Tuning w/ Simulated Annealing Optimizer',
                     fontsize=14)
        ax[0].scatter(max_weight_pct_list,
                      knapsack_tuning_fitness,
                      c='r',
                      marker='x',
                      s=10)
        ax[0].set(xlabel='Max Weight %', ylabel='Max Fitness')
        ax[1].scatter(max_weight_pct_list,
                      knapsack_tuning_time,
                      c='g',
                      marker='o',
                      s=10)
        ax[1].set(xlabel='Max Weight %', ylabel='Max Runtime (s)')
        ax[2].scatter(max_weight_pct_list,
                      knapsack_tuning_fevals,
                      c='b',
                      marker='+')
        ax[2].set(xlabel='Max Weight %', ylabel='Max Function Evaluations')
        ax[2].yaxis.tick_right()
        plt.show()

        return
    if 'tuning_plots' in task:
        # FOUR PEAKS GOOD FOR GENETIC
        # Tune Algorithms
        problem_size = 50

        # Knapsack
        weights = [idx for idx in range(1, problem_size + 1)]
        print(weights)
        #weights = np.ones(100)
        values = [idx for idx in range(1, problem_size + 1)]
        #values = np.arange(1, 101)
        max_weight_pct = 0.3
        knapsack_fitness = mlrose.Knapsack(weights, values, max_weight_pct)
        #state = np.array([1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0])
        #problem = mlrose.DiscreteOpt(problem_size, four_peaks_fitness, maximize=True, max_val=2)
        #temperature_list = np.arange(0.1, 2, 0.1)
        best_fitness_list = []
        #for size in problem_size_list:
        problem = mlrose.DiscreteOpt(problem_size,
                                     knapsack_fitness,
                                     maximize=True,
                                     max_val=2)

        problem_size = 50
        rhc_fitness_tuning_list = []
        rhc_param_tuning_list = []
        rhc_feval_tuning_list = []
        time_tuning_list = []
        asdf_list = []
        fdsa_list = []
        experiment_name = 'rhc_knapsack_tuning_size_' + str(problem_size)
        #restart_list = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100]
        restart_list = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
        rhc = runners.RHCRunner(problem=problem,
                                experiment_name=experiment_name,
                                output_directory='knapsack',
                                seed=27,
                                iteration_list=[5000],
                                max_attempts=125,
                                restart_list=restart_list)
        # the two data frames will contain the results
        rhc_run_stats, rhc_run_curves = rhc.run()
        for restart in restart_list:
            this_temp_df = rhc_run_curves.loc[rhc_run_curves['Restarts'] ==
                                              restart]
            this_temp_df[
                'Iteration'] = this_temp_df['Iteration'] - this_temp_df.loc[
                    this_temp_df['Iteration'].idxmin()]['Iteration'] + 1
            rhc_fitness_tuning_list.append(
                this_temp_df.loc[this_temp_df['Fitness'].idxmax()]['Fitness'])
            rhc_param_tuning_list.append(restart)
            time_tuning_list.append(
                this_temp_df.loc[this_temp_df['Time'].idxmax()]['Time'])
            rhc_feval_tuning_list.append(3 * this_temp_df.loc[
                this_temp_df['Iteration'].idxmax()]['Iteration'])
            asdf_list.append(this_temp_df['Fitness'])
            fdsa_list.append(this_temp_df['Iteration'])
        # plt.rc("font", size=8)
        # plt.rc("axes", titlesize=12)
        # plt.rc("axes", labelsize=10)
        # plt.rc("xtick", labelsize=8)
        # plt.rc("ytick", labelsize=8)
        # plt.rc("legend", fontsize=8)
        # plt.rc("figure", titlesize=11)
        # #fig, ax = plt.subplots(2, 1, dpi=100, sharex=True, figsize=(5,4))
        # fig, ax = plt.subplots(1,3,figsize=(12,3.5))
        # fig.suptitle('RHC Restarts Tuning, problem_size = ' + str(problem_size))
        # ax[0].scatter(param_tuning_list, time_tuning_list, c='r', marker='x', s=10)
        # ax[0].set(xlabel='Restarts', ylabel = 'Time')

        # ax[1].scatter(param_tuning_list, fitness_tuning_list, c='g', marker='o', s=10)
        # ax[1].set(xlabel='Restarts', ylabel = 'Fitness')

        # ax[2].scatter(param_tuning_list, feval_tuning_feval, c='g', marker='o', s=10)
        # ax[2].set(xlabel='Restartsc', ylabel = 'Function Evaluations')
        # ax[2].yaxis.tick_right()

        # plt.show()

        # fig, ax = plt.subplots()
        # ax.scatter(fdsa_list[7], asdf_list[7])
        # ax.set(xlabel='Iteration', ylabel = 'Fitness')
        # plt.show()
        # problem_size = 50

        sa_fitness_tuning_list = []
        sa_param_tuning_list = []
        time_tuning_list = []
        sa_feval_tuning_list = []
        asdf_list = []
        fdsa_list = []
        experiment_name = 'sa_knapsack_tuning_size_' + str(problem_size)
        temperature_list = np.arange(1, 50, 0.5)
        sa = runners.SARunner(problem=problem,
                              experiment_name=experiment_name,
                              output_directory='knapsack',
                              seed=27,
                              iteration_list=[1000],
                              max_attempts=50,
                              temperature_list=temperature_list)
        #decay_list=mlrose.GeomDecay(init_temp=1.1))
        #temperature_list=[1, 10, 50, 100, 250, 500, 1000, 2500, 5000, 10000])
        #temperature_list=[1, 10, 50, 100, 250, 500, 1000, 2500, 5000, 10000])
        # the two data frames will contain the results
        df_run_stats, df_run_curves = sa.run()
        df_run_curves['Temperature'] = pd.to_numeric(
            df_run_curves['Temperature'].astype(str).astype(float))
        for temp in temperature_list:
            this_temp_df = df_run_curves.loc[df_run_curves['Temperature'] ==
                                             temp]
            this_temp_df[
                'Iteration'] = this_temp_df['Iteration'] - this_temp_df.loc[
                    this_temp_df['Iteration'].idxmin()]['Iteration'] + 1
            sa_fitness_tuning_list.append(
                this_temp_df.loc[this_temp_df['Fitness'].idxmax()]['Fitness'])
            sa_param_tuning_list.append(temp)
            sa_feval_tuning_list.append(2 * this_temp_df.loc[
                this_temp_df['Iteration'].idxmax()]['Iteration'])
            time_tuning_list.append(
                this_temp_df.loc[this_temp_df['Time'].idxmax()]['Time'])
            asdf_list.append(this_temp_df['Fitness'])
            fdsa_list.append(this_temp_df['Iteration'])
        # plt.rc("font", size=8)
        # plt.rc("axes", titlesize=12)
        # plt.rc("axes", labelsize=10)
        # plt.rc("xtick", labelsize=8)
        # plt.rc("ytick", labelsize=8)
        # plt.rc("legend", fontsize=8)
        # plt.rc("figure", titlesize=11)
        # #fig, ax = plt.subplots(2, 1, dpi=100, sharex=True, figsize=(5,4))
        # fig, ax = plt.subplots(1,3,figsize=(12,3.5))
        # fig.suptitle('SA Temperature Tuning, problem_size = ' + str(problem_size))
        # ax[0].scatter(param_tuning_list, time_tuning_list, c='r', marker='x', s=10)
        # ax[0].set(xlabel='Temperature', ylabel = 'Time')

        # ax[1].scatter(param_tuning_list, fitness_tuning_list, c='g', marker='o', s=10)
        # ax[1].set(xlabel='Temperature', ylabel = 'Fitness')

        # ax[2].scatter(param_tuning_list, feval_tuning_list, c='g', marker='o', s=10)
        # ax[2].set(xlabel='Temperature', ylabel = 'Function Evaluations')
        # ax[2].yaxis.tick_right()

        # plt.show()

        # fig, ax = plt.subplots()
        # ax.scatter(fdsa_list[17], asdf_list[17])
        # ax.set(xlabel='Iteration', ylabel = 'Fitness')
        # plt.show()

        ga_fitness_tuning_list = []
        ga_param_tuning_list = []
        time_tuning_list = []
        ga_feval_tuning_list = []
        asdf_list = []
        fdsa_list = []
        experiment_name = 'ga_knapsack_tuning_size_' + str(problem_size)
        population_sizes_list = 100,  # trailing comma makes this a one-element tuple
        mutation_rates_list = np.arange(0.05, 1.0, 0.05)
        ga = runners.GARunner(problem=problem,
                              experiment_name=experiment_name,
                              output_directory='knapsack',
                              seed=27,
                              iteration_list=[100],
                              population_sizes=population_sizes_list,
                              mutation_rates=mutation_rates_list,
                              max_attempts=5)

        # the two data frames will contain the results
        df_run_stats, df_run_curves = ga.run()

        # for rate in mutation_rates_list:
        #     this_temp_df = df_run_curves.loc[df_run_curves['Mutation Rate'] == rate]
        #     this_temp_df['Iteration'] = this_temp_df['Iteration'] - this_temp_df.loc[this_temp_df['Iteration'].idxmin()]['Iteration'] + 1
        #     ga_fitness_tuning_list.append(this_temp_df.loc[this_temp_df['Fitness'].idxmax()]['Fitness'])
        #     ga_param_tuning_list.append(rate)
        #     feval_tuning_list.append(population_sizes_list[0] * this_temp_df.loc[this_temp_df['Iteration'].idxmax()]['Iteration'])
        #     time_tuning_list.append(this_temp_df.loc[this_temp_df['Time'].idxmax()]['Time'])
        #     asdf_list.append(this_temp_df['Fitness'])
        #     fdsa_list.append(this_temp_df['Iteration'])
        # print(time_tuning_list)
        # plt.rc("font", size=8)
        # plt.rc("axes", titlesize=12)
        # plt.rc("axes", labelsize=10)
        # plt.rc("xtick", labelsize=8)
        # plt.rc("ytick", labelsize=8)
        # plt.rc("legend", fontsize=8)
        # plt.rc("figure", titlesize=11)
        # #fig, ax = plt.subplots(2, 1, dpi=100, sharex=True, figsize=(5,4))
        # fig, ax = plt.subplots(1,3,figsize=(12,3.5))
        # fig.suptitle('GA Mutation Rate Tuning, problem_size = ' + str(problem_size))
        # ax[0].scatter(param_tuning_list, time_tuning_list, c='r', marker='x', s=10)
        # ax[0].set(xlabel='Mutation Rate', ylabel = 'Time (s)')

        # ax[1].scatter(param_tuning_list, fitness_tuning_list, c='g', marker='o', s=10)
        # ax[1].set(xlabel='Mutation Rate', ylabel = 'Fitness')

        # ax[2].scatter(param_tuning_list, feval_tuning_list, c='g', marker='o', s=10)
        # ax[2].set(xlabel='Mutation Rate', ylabel = 'Function Evaluations')
        # ax[2].yaxis.tick_right()

        # plt.show()

        # fig, ax = plt.subplots()
        # ax.scatter(fdsa_list[17], asdf_list[17])
        # ax.set(xlabel='Iteration', ylabel = 'Fitness')
        # plt.show()

        # Tune population size
        ga_population_tuning_fitness = []
        ga_population_tuning_time = []
        ga_population_tuning_feval = []
        population_sizes_list = np.arange(10, 500, 10)
        for population_size in population_sizes_list:
            experiment_name = 'ga_knapsack_tuning_population_size_' + str(
                problem_size)
            mutation_rates_list = [0.1]
            ga = runners.GARunner(problem=problem,
                                  experiment_name=experiment_name,
                                  output_directory='knapsack',
                                  seed=27,
                                  iteration_list=[500],
                                  population_sizes=[int(population_size)],
                                  mutation_rates=mutation_rates_list,
                                  max_attempts=10)

            # the two data frames will contain the results
            ga_run_stats, ga_run_curves = ga.run()
            ga_population_tuning_fitness.append(ga_run_curves.loc[
                ga_run_curves['Fitness'].idxmax()]['Fitness'])
            ga_population_tuning_time.append(
                ga_run_curves.loc[ga_run_curves['Time'].idxmax()]['Time'])
            ga_population_tuning_feval.append(
                population_size * ga_run_curves.loc[
                    ga_run_curves['Iteration'].idxmax()]['Iteration'])

        # plt.rc("font", size=8)
        # plt.rc("axes", titlesize=12)
        # plt.rc("axes", labelsize=10)
        # plt.rc("xtick", labelsize=8)
        # plt.rc("ytick", labelsize=8)
        # plt.rc("legend", fontsize=8)
        # plt.rc("figure", titlesize=11)
        # #fig, ax = plt.subplots(2, 1, dpi=100, sharex=True, figsize=(5,4))
        # fig, ax = plt.subplots(1,3,figsize=(12,3.5))
        # fig.suptitle('GA Population Size Tuning, problem_size = ' + str(problem_size))
        # ax[0].scatter(population_sizes_list, ga_population_tuning_time, c='r', marker='x', s=10)
        # ax[0].set(xlabel='Population Size', ylabel = 'Time')

        # ax[1].scatter(population_sizes_list, ga_population_tuning_fitness, c='g', marker='x', s=10)
        # ax[1].set(xlabel='Population Size', ylabel = 'Fitness')

        # ax[2].scatter(param_tuning_list, ga_population_tuning_feval, c='g', marker='o', s=10)
        # ax[2].set(xlabel='Population Size', ylabel = 'Function Evaluations')
        # ax[2].yaxis.tick_right()

        # plt.show()

        mimic_fitness_tuning_list = []
        mimic_param_tuning_list = []
        time_tuning_list = []
        mimic_feval_tuning_list = []
        asdf_list = []
        fdsa_list = []
        experiment_name = 'mimic_knapsack_tuning_size_' + str(problem_size)
        population_sizes_list = 100,
        # keep_percent_list=np.arange(0.05, 1.0, 0.05)
        # mimic = runners.MIMICRunner(problem=problem,
        #             experiment_name=experiment_name,
        #             output_directory='knapsack',
        #             seed=27,
        #             iteration_list=[100],
        #             population_sizes=population_sizes_list,
        #             keep_percent_list=keep_percent_list,
        #             max_attempts=5)

        # # the two data frames will contain the results
        # df_run_stats, df_run_curves = mimic.run()
        # print(df_run_curves.dtypes)
        # print(df_run_curves)
        # #df_run_curves['Temperature'] = pd.to_numeric(df_run_curves['Temperature'].astype(str).astype(float))
        # print(df_run_curves)
        # for percent in keep_percent_list:
        #     this_temp_df = df_run_curves.loc[df_run_curves['Keep Percent'] == percent]
        #     this_temp_df['Iteration'] = this_temp_df['Iteration'] - this_temp_df.loc[this_temp_df['Iteration'].idxmin()]['Iteration'] + 1
        #     mimic_fitness_tuning_list.append(this_temp_df.loc[this_temp_df['Fitness'].idxmax()]['Fitness'])
        #     mimic_param_tuning_list.append(percent)
        #     feval_tuning_list.append(population_sizes_list[0] * this_temp_df.loc[this_temp_df['Iteration'].idxmax()]['Iteration'])
        #     time_tuning_list.append(this_temp_df.loc[this_temp_df['Time'].idxmax()]['Time'])
        #     asdf_list.append(this_temp_df['Fitness'])
        #     fdsa_list.append(this_temp_df['Iteration'])

        # plt.rc("font", size=8)
        # plt.rc("axes", titlesize=12)
        # plt.rc("axes", labelsize=10)
        # plt.rc("xtick", labelsize=8)
        # plt.rc("ytick", labelsize=8)
        # plt.rc("legend", fontsize=8)
        # plt.rc("figure", titlesize=11)
        # #fig, ax = plt.subplots(2, 1, dpi=100, sharex=True, figsize=(5,4))
        # fig, ax = plt.subplots(1,3,figsize=(12,3.5))
        # fig.suptitle('MIMIC Keep Percent Tuning, problem_size = ' + str(problem_size))
        # ax[0].scatter(param_tuning_list, time_tuning_list, c='r', marker='x', s=10)
        # ax[0].set(xlabel='Keep Percent (decimal)', ylabel = 'Time (s)')

        # ax[1].scatter(param_tuning_list, fitness_tuning_list, c='g', marker='o', s=10)
        # ax[1].set(xlabel='Keep Percent (decimal)', ylabel = 'Fitness')

        # ax[2].scatter(param_tuning_list, feval_tuning_list, c='g', marker='o', s=10)
        # ax[2].set(xlabel='Keep Percent (decimal)', ylabel = 'Function Evaluations')
        # ax[2].yaxis.tick_right()

        # plt.show()

        # fig, ax = plt.subplots()
        # ax.scatter(fdsa_list[17], asdf_list[17])
        # ax.set(xlabel='Iteration', ylabel = 'Fitness')
        # plt.show()

        # Tune population size
        mimic_population_tuning_fitness = []
        mimic_population_tuning_time = []
        mimic_population_tuning_feval = []
        population_sizes_list = np.arange(10, 500, 10)
        for population_size in population_sizes_list:
            experiment_name = 'mimic_knapsack_tuning_population_size_' + str(
                problem_size)
            keep_percent_list = [0.45]
            mimic = runners.MIMICRunner(
                problem=problem,
                experiment_name=experiment_name,
                output_directory='knapsack',
                seed=27,
                iteration_list=[100],
                population_sizes=[int(population_size)],
                keep_percent_list=keep_percent_list,
                max_attempts=5,
                use_fast_mimic=True)

            # the two data frames will contain the results
            mimic_run_stats, mimic_run_curves = mimic.run()
            mimic_population_tuning_fitness.append(mimic_run_curves.loc[
                mimic_run_curves['Fitness'].idxmax()]['Fitness'])
            mimic_population_tuning_time.append(mimic_run_curves.loc[
                mimic_run_curves['Time'].idxmax()]['Time'])
            mimic_population_tuning_feval.append(
                population_size * mimic_run_curves.loc[
                    mimic_run_curves['Iteration'].idxmax()]['Iteration'])

        plt.rc("font", size=8)
        plt.rc("axes", titlesize=14)
        plt.rc("axes", labelsize=10)
        plt.rc("xtick", labelsize=8)
        plt.rc("ytick", labelsize=8)
        plt.rc("legend", fontsize=11)
        plt.rc("figure", titlesize=11)
        fig, ax = plt.subplots(2, 4, figsize=(12, 7))
        fig.suptitle('Knapsack Algorithm Tuning, problem size = ' +
                     str(problem_size))

        ax[0, 0].scatter(rhc_param_tuning_list,
                         rhc_fitness_tuning_list,
                         c='r',
                         marker='x',
                         s=10)
        ax[0, 0].set(xlabel='Restarts', ylabel='Fitness', title='RHC Restarts')

        ax[0, 1].scatter(sa_param_tuning_list,
                         sa_fitness_tuning_list,
                         c='g',
                         marker='o',
                         s=10)
        ax[0, 1].set(xlabel='Temperature', title='SA Temperature')

        ax[0, 2].scatter(population_sizes_list,
                         ga_population_tuning_fitness,
                         c='g',
                         marker='o',
                         s=10)
        ax[0, 2].set(xlabel='Population Size', title='GA Population Size')
        ax[0, 2].yaxis.tick_right()

        ax[0, 3].scatter(population_sizes_list,
                         mimic_population_tuning_fitness,
                         c='g',
                         marker='o',
                         s=10)
        ax[0, 3].set(xlabel='Population Size', title='MIMIC Population Size')
        ax[0, 3].yaxis.tick_right()

        ax[1, 0].scatter(rhc_param_tuning_list,
                         rhc_feval_tuning_list,
                         c='r',
                         marker='x',
                         s=10)
        ax[1, 0].set(xlabel='Restarts', ylabel='Function Evaluations')

        ax[1, 1].scatter(sa_param_tuning_list,
                         sa_feval_tuning_list,
                         c='g',
                         marker='o',
                         s=10)
        ax[1, 1].set(xlabel='Temperature')

        ax[1, 2].scatter(population_sizes_list,
                         ga_population_tuning_feval,
                         c='g',
                         marker='o',
                         s=10)
        ax[1, 2].set(xlabel='Population Size')
        ax[1, 2].yaxis.tick_right()

        ax[1, 3].scatter(population_sizes_list,
                         mimic_population_tuning_feval,
                         c='g',
                         marker='o',
                         s=10)
        ax[1, 3].set(xlabel='Population Size')
        ax[1, 3].yaxis.tick_right()

        plt.show()

    if 'complexity_graph' in task:
        problem_size_list = np.arange(5, 85, 5)
        sa_time_list = []
        sa_fitness_list = []
        sa_feval_list = []
        rhc_time_list = []
        rhc_fitness_list = []
        rhc_feval_list = []
        ga_time_list = []
        ga_fitness_list = []
        ga_feval_list = []
        mimic_time_list = []
        mimic_fitness_list = []
        mimic_feval_list = []
        for problem_size in problem_size_list:
            # Knapsack
            weights = [idx for idx in range(1, problem_size + 1)]
            print(weights)
            values = [idx for idx in range(1, problem_size + 1)]
            max_weight_pct = 0.3
            knapsack_fitness = mlrose.Knapsack(weights, values, max_weight_pct)
            best_fitness_list = []
            problem = mlrose.DiscreteOpt(int(problem_size),
                                         knapsack_fitness,
                                         maximize=True,
                                         max_val=2)

            # RHC
            experiment_name = 'rhc_knapsack_complexity_size_' + str(
                problem_size)
            restart_list = [100]
            rhc = runners.RHCRunner(problem=problem,
                                    experiment_name=experiment_name,
                                    output_directory='knapsack',
                                    seed=27,
                                    iteration_list=[5000],
                                    max_attempts=10,
                                    restart_list=restart_list)
            # the two data frames will contain the results
            rhc_run_stats, rhc_run_curves = rhc.run()
            rhc_time = rhc_run_curves['Time']
            rhc_fitness = rhc_run_curves['Fitness']
            rhc_iteration = rhc_run_curves['Iteration']
            rhc_fitness_list.append(rhc_run_curves.loc[
                rhc_run_curves['Fitness'].idxmax()]['Fitness'])
            rhc_time_list.append(
                rhc_run_curves.loc[rhc_run_curves['Time'].idxmax()]['Time'])
            rhc_feval_list.append(3 * rhc_run_curves.loc[
                rhc_run_curves['Iteration'].idxmax()]['Iteration'])

            # SA
            experiment_name = 'sa_knapsack_complexity_size_' + str(
                problem_size)
            temperature_list = [2]
            sa = runners.SARunner(problem=problem,
                                  experiment_name=experiment_name,
                                  output_directory='knapsack',
                                  seed=27,
                                  iteration_list=[10000],
                                  max_attempts=50,
                                  temperature_list=temperature_list)
            # the two data frames will contain the results
            sa_run_stats, sa_run_curves = sa.run()
            # print(sa_run_curves.dtypes)
            # print(sa_run_curves)
            sa_run_curves['Temperature'] = pd.to_numeric(
                sa_run_curves['Temperature'].astype(str).astype(float))
            # print(df_run_curves)
            sa_time = sa_run_curves['Time']
            sa_fitness = sa_run_curves['Fitness']
            sa_iteration = sa_run_curves['Iteration']
            sa_fitness_list.append(sa_run_curves.loc[
                sa_run_curves['Fitness'].idxmax()]['Fitness'])
            sa_time_list.append(
                sa_run_curves.loc[sa_run_curves['Time'].idxmax()]['Time'])
            sa_feval_list.append(2 * sa_run_curves.loc[
                sa_run_curves['Iteration'].idxmax()]['Iteration'])

            # GA
            experiment_name = 'ga_knapsack_complexity_size_' + str(
                problem_size)
            population_sizes_list = 100,
            mutation_rates_list = [0.15]
            ga = runners.GARunner(problem=problem,
                                  experiment_name=experiment_name,
                                  output_directory='knapsack',
                                  seed=27,
                                  iteration_list=[1000],
                                  population_sizes=population_sizes_list,
                                  mutation_rates=mutation_rates_list,
                                  max_attempts=100)
            # the two data frames will contain the results
            ga_run_stats, ga_run_curves = ga.run()
            # print(ga_run_curves.dtypes)
            # print(ga_run_curves)
            # print(df_run_curves)
            ga_time = ga_run_curves['Time']
            ga_fitness = ga_run_curves['Fitness']
            ga_iteration = ga_run_curves['Iteration']
            ga_fitness_list.append(ga_run_curves.loc[
                ga_run_curves['Fitness'].idxmax()]['Fitness'])
            ga_time_list.append(
                ga_run_curves.loc[ga_run_curves['Time'].idxmax()]['Time'])
            ga_feval_list.append(population_sizes_list[0] * ga_run_curves.loc[
                ga_run_curves['Iteration'].idxmax()]['Iteration'])

            # MIMIC
            experiment_name = 'mimic_knapsack_complexity_size_' + str(
                problem_size)
            population_sizes_list = 200,
            keep_percent_list = [0.35]
            mimic = runners.MIMICRunner(problem=problem,
                                        experiment_name=experiment_name,
                                        output_directory='knapsack',
                                        seed=27,
                                        iteration_list=[150],
                                        population_sizes=population_sizes_list,
                                        keep_percent_list=keep_percent_list,
                                        max_attempts=15,
                                        use_fast_mimic=True)
            # the two data frames will contain the results
            mimic_run_stats, mimic_run_curves = mimic.run()
            # print(mimic_run_curves.dtypes)
            # print(mimic_run_curves)
            # print(df_run_curves)
            mimic_time = mimic_run_curves['Time']
            mimic_fitness = mimic_run_curves['Fitness']
            mimic_iteration = mimic_run_curves['Iteration']
            mimic_fitness_list.append(mimic_run_curves.loc[
                mimic_run_curves['Fitness'].idxmax()]['Fitness'])
            mimic_time_list.append(mimic_run_curves.loc[
                mimic_run_curves['Time'].idxmax()]['Time'])
            mimic_feval_list.append(
                population_sizes_list[0] * mimic_run_curves.loc[
                    mimic_run_curves['Iteration'].idxmax()]['Iteration'])

        plt.rc("font", size=8)
        plt.rc("axes", titlesize=12)
        plt.rc("axes", labelsize=10)
        plt.rc("xtick", labelsize=8)
        plt.rc("ytick", labelsize=8)
        plt.rc("legend", fontsize=8)
        plt.rc("figure", titlesize=11)
        #fig, ax = plt.subplots(2, 1, dpi=100, sharex=True, figsize=(5,4))
        fig, ax = plt.subplots(1, 3, figsize=(12, 3.5))
        fig.suptitle('Knapsack Complexity Analysis', fontsize=14)
        # ax[0].plot(problem_size_list, sa_fitness_list, 'b-', label='Simulated Annealing', linewidth=1)
        # ax[0].plot(problem_size_list, ga_fitness_list, 'g:', label='Genetic', linewidth=1)
        w = 1
        ax[0].bar(problem_size_list - w,
                  sa_fitness_list,
                  width=w,
                  color='blue',
                  label='Simulated Annealing')
        ax[0].bar(problem_size_list,
                  ga_fitness_list,
                  width=w,
                  color='green',
                  label='Genetic')
        ax[0].bar(problem_size_list - 2 * w,
                  rhc_fitness_list,
                  width=w,
                  color='red',
                  label='Random Hill Climb')
        ax[0].bar(problem_size_list + w,
                  mimic_fitness_list,
                  width=w,
                  color='orange',
                  label='MIMIC')
        ax[0].set(xlabel='Knapsack Size', ylabel='Fitness')
        ax[0].legend()

        ax[1].plot(problem_size_list,
                   sa_time_list,
                   'b-',
                   label='Simulated Annealing',
                   linewidth=1)
        ax[1].plot(problem_size_list,
                   ga_time_list,
                   'g:',
                   label='Genetic',
                   linewidth=1)
        ax[1].plot(problem_size_list,
                   rhc_time_list,
                   'r--',
                   label='Random Hill Climb',
                   linewidth=1)
        ax[1].plot(problem_size_list,
                   mimic_time_list,
                   '-.',
                   color='orange',
                   label='MIMIC',
                   linewidth=1)
        ax[1].set(xlabel='Knapsack Size', ylabel='Time (s)')
        ax[1].legend()

        ax[2].plot(problem_size_list,
                   sa_feval_list,
                   'b-',
                   label='Simulated Annealing',
                   linewidth=1)
        ax[2].plot(problem_size_list,
                   ga_feval_list,
                   'g:',
                   label='Genetic',
                   linewidth=1)
        ax[2].plot(problem_size_list,
                   rhc_feval_list,
                   'r--',
                   label='Random Hill Climb',
                   linewidth=1)
        ax[2].plot(problem_size_list,
                   mimic_feval_list,
                   '-.',
                   color='orange',
                   label='MIMIC',
                   linewidth=1)
        ax[2].set(xlabel='Knapsack Size', ylabel='Function Evaluations')
        ax[2].yaxis.tick_right()
        plt.show()

    if 'performance_graph' in task:
        problem_size = 80

        # Knapsack
        weights = [idx for idx in range(1, problem_size + 1)]
        print(weights)
        values = [idx for idx in range(1, problem_size + 1)]
        max_weight_pct = 0.3
        knapsack_fitness = mlrose.Knapsack(weights, values, max_weight_pct)
        best_fitness_list = []
        problem = mlrose.DiscreteOpt(int(problem_size),
                                     knapsack_fitness,
                                     maximize=True,
                                     max_val=2)

        # RHC
        experiment_name = 'rhc_knapsack_performance_size_' + str(problem_size)
        restart_list = [100]
        rhc = runners.RHCRunner(problem=problem,
                                experiment_name=experiment_name,
                                output_directory='knapsack',
                                seed=27,
                                iteration_list=[5000],
                                max_attempts=10,
                                restart_list=restart_list)
        # the two data frames will contain the results
        rhc_run_stats, rhc_run_curves = rhc.run()
        # print(rhc_run_curves.dtypes)
        # print(rhc_run_curves)
        # print(df_run_curves)
        rhc_time = rhc_run_curves['Time']
        rhc_fitness = rhc_run_curves['Fitness']
        rhc_iteration = rhc_run_curves['Iteration']
        rhc_feval = rhc_run_curves['Iteration'] * 2

        # SA
        experiment_name = 'sa_knapsack_performance_size_' + str(problem_size)
        temperature_list = [2]
        sa = runners.SARunner(problem=problem,
                              experiment_name=experiment_name,
                              output_directory='knapsack',
                              seed=27,
                              iteration_list=[10000],
                              max_attempts=50,
                              temperature_list=temperature_list)
        # the two data frames will contain the results
        sa_run_stats, sa_run_curves = sa.run()
        # print(sa_run_curves.dtypes)
        # print(sa_run_curves)
        sa_run_curves['Temperature'] = pd.to_numeric(
            sa_run_curves['Temperature'].astype(str).astype(float))
        # print(df_run_curves)
        sa_time = sa_run_curves['Time']
        sa_fitness = sa_run_curves['Fitness']
        sa_iteration = sa_run_curves['Iteration']
        sa_feval = sa_run_curves['Iteration'] * 2

        # GA
        experiment_name = 'ga_knapsack_performance_size_' + str(problem_size)
        population_sizes_list = 100,
        mutation_rates_list = [0.15]
        ga = runners.GARunner(problem=problem,
                              experiment_name=experiment_name,
                              output_directory='knapsack',
                              seed=27,
                              iteration_list=[1000],
                              population_sizes=population_sizes_list,
                              mutation_rates=mutation_rates_list,
                              max_attempts=100)
        # the two data frames will contain the results
        ga_run_stats, ga_run_curves = ga.run()
        # print(ga_run_curves.dtypes)
        # print(ga_run_curves)
        # print(df_run_curves)
        ga_time = ga_run_curves['Time']
        ga_fitness = ga_run_curves['Fitness']
        ga_iteration = ga_run_curves['Iteration']
        ga_feval = ga_run_curves['Iteration'] * population_sizes_list[0]

        # MIMIC
        experiment_name = 'mimic_knapsack_performance_size_' + str(
            problem_size)
        population_sizes_list = 200,
        keep_percent_list = [0.5]
        mimic = runners.MIMICRunner(problem=problem,
                                    experiment_name=experiment_name,
                                    output_directory='knapsack',
                                    seed=27,
                                    iteration_list=[150],
                                    population_sizes=population_sizes_list,
                                    keep_percent_list=keep_percent_list,
                                    max_attempts=15,
                                    use_fast_mimic=True)
        # the two data frames will contain the results
        mimic_run_stats, mimic_run_curves = mimic.run()
        # print(mimic_run_curves.dtypes)
        # print(mimic_run_curves)
        # print(df_run_curves)
        mimic_time = mimic_run_curves['Time']
        mimic_fitness = mimic_run_curves['Fitness']
        mimic_iteration = mimic_run_curves['Iteration']
        mimic_feval = mimic_run_curves['Iteration'] * population_sizes_list[0]

        plt.rc("font", size=8)
        plt.rc("axes", titlesize=12)
        plt.rc("axes", labelsize=10)
        plt.rc("xtick", labelsize=8)
        plt.rc("ytick", labelsize=8)
        plt.rc("legend", fontsize=8)
        plt.rc("figure", titlesize=11)
        #fig, ax = plt.subplots(2, 1, dpi=100, sharex=True, figsize=(5,4))
        fig, ax = plt.subplots(1, 3, figsize=(12, 3.5))
        fig.suptitle(
            'Knapsack Algorithm Performance Analysis, problem size = ' +
            str(problem_size),
            fontsize=14)
        # ax[0].plot(problem_size_list, sa_fitness_list, 'b-', label='Simulated Annealing', linewidth=1)
        # ax[0].plot(problem_size_list, ga_fitness_list, 'g:', label='Genetic', linewidth=1)
        w = 1
        ax[0].plot(rhc_iteration,
                   rhc_fitness,
                   'r--',
                   label='Random Hill Climb',
                   linewidth=1)
        ax[0].plot(sa_iteration,
                   sa_fitness,
                   'b:',
                   label='Simulated Annealing',
                   linewidth=1)
        ax[0].plot(ga_iteration,
                   ga_fitness,
                   'g-',
                   label='Genetic',
                   linewidth=2)
        ax[0].plot(mimic_iteration,
                   mimic_fitness,
                   '-.',
                   color='orange',
                   label='MIMIC',
                   linewidth=2)
        ax[0].set(xlabel='Iteration', ylabel='Fitness')
        ax[0].legend()
        #ax[0].set_title('Fitness vs. Iteration')

        ax[1].plot(rhc_time,
                   rhc_fitness,
                   'r--',
                   label='Random Hill Climb',
                   linewidth=1)
        ax[1].plot(sa_time,
                   sa_fitness,
                   'b:',
                   label='Simulated Annealing',
                   linewidth=1)
        ax[1].plot(ga_time, ga_fitness, 'g-', label='Genetic', linewidth=2)
        ax[1].plot(mimic_time,
                   mimic_fitness,
                   '-.',
                   color='orange',
                   label='MIMIC',
                   linewidth=2)
        ax[1].set(xlabel='Time (s)', ylabel='Fitness')
        ax[1].legend()

        ax[2].plot(rhc_feval,
                   rhc_fitness,
                   'r--',
                   label='Random Hill Climb',
                   linewidth=1)
        ax[2].plot(sa_feval,
                   sa_fitness,
                   'b:',
                   label='Simulated Annealing',
                   linewidth=1)
        ax[2].plot(ga_feval, ga_fitness, 'g-', label='Genetic', linewidth=1)
        ax[2].plot(mimic_feval,
                   mimic_fitness,
                   '-.',
                   color='orange',
                   label='MIMIC',
                   linewidth=1)
        ax[2].set(xlabel='Function Evaluations')
        plt.show()

    return
Example #57
0
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 26 18:17:30 2015
@author: ldierker
"""
import pandas
import numpy
import scipy.stats
import seaborn
import statsmodels
import matplotlib.pyplot as plt
data = pandas.read_csv('nesarc.txt', low_memory=False)
""" setting variables you will be working with to numeric
10/29/15 note that the code is different from what you see in the videos
 A new version of pandas was released that is phasing out the convert_objects(convert_numeric=True)
It still works for now, but it is recommended that the pandas.to_numeric function be
used instead """
""" old code:
data['TAB12MDX'] = data['TAB12MDX'].convert_objects(convert_numeric=True)
data['CHECK321'] = data['CHECK321'].convert_objects(convert_numeric=True)
data['S3AQ3B1'] = data['S3AQ3B1'].convert_objects(convert_numeric=True)
data['S3AQ3C1'] = data['S3AQ3C1'].convert_objects(convert_numeric=True)
data['AGE'] = data['AGE'].convert_objects(convert_numeric=True) """
# new code setting variables you will be working with to numeric
data['TAB12MDX'] = pandas.to_numeric(data['TAB12MDX'], errors='coerce')
data['CHECK321'] = pandas.to_numeric(data['CHECK321'], errors='coerce')
data['S3AQ3B1'] = pandas.to_numeric(data['S3AQ3B1'], errors='coerce')
data['S3AQ3C1'] = pandas.to_numeric(data['S3AQ3C1'], errors='coerce')
data['AGE'] = pandas.to_numeric(data['AGE'], errors='coerce')
#subset data to young adults age 18 to 25 who have smoked in the past 12 months
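# a minimal sketch of that subset (assumes CHECK321 == 1 codes "smoked in the
# past 12 months" in the NESARC codebook; adjust the cut-offs to the codebook)
sub1 = data[(data['AGE'] >= 18) & (data['AGE'] <= 25) & (data['CHECK321'] == 1)].copy()
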
active_users_longer_intervals=[]

active_devs_sleeping_intervals_df = []
active_devs_hibernation_intervals_df = []
active_devs_dead_intervals_df = []

n=0
for index, row in active_users_breaks.iterrows():
    user_id=row['durations'][0]
    
    last_commit_day=util.getLastCommitDay(commit_table, user_id)
    last_break_length=util.days_between(last_commit_day, project_end)
    last_break_interval=last_commit_day+'/'+project_end
    
    row['durations'] = pandas.to_numeric(row['durations'][1:-2], errors='raise', downcast='integer').tolist()
    row['durations'].append(last_break_length)
    row['datelimits'] = row['datelimits'][1:]
    row['datelimits'].append(last_break_interval)
    
    user_actions = ae.get_user_activities(super_path, g, project_start_dt, project_end, user_id)
                    
    ### Here the NORMAL execution goes on
    longer_breaks = pandas.DataFrame(columns=['durations', 'datelimits'])
    current_user_hibernation_periods_df = pandas.DataFrame(columns=['durations', 'datelimits'])
    current_user_sleepy_periods_df = pandas.DataFrame(columns=['durations', 'datelimits'])
    current_user_dead_periods_df=pandas.DataFrame(columns=['durations', 'datelimits'])
    dead_th = cfg.dead_threshold

    current_sleepy_periods_details=[]
    SLIDE_WIN_SIZE = 20
Example #59
0
    def apply(self, experiment):
        """
        Assigns new metadata to events using the mixture model estimated
        in :meth:`estimate`.
        
        Returns
        -------
        Experiment
            A new :class:`.Experiment` with the new condition variables as
            described in the class documentation.  Also adds the following
            new statistics:
            
            - **mean** : Float
                the mean of the fitted Gaussian in each channel for each component.
                
            - **sigma** : Float
                the standard deviation of the fitted Gaussian in each channel for
                each component.
                
            - **interval** : (Float, Float)
                the locations of the mean +/- one standard deviation in each
                channel for each component.
                
            - **correlation** : Float
                the correlation coefficient between each pair of channels for each
                component.
                
            - **proportion** : Float
                the proportion of events in each component of the mixture model.  Only
                added if :attr:`num_components` ``> 1``.
        """
             
        if experiment is None:
            raise util.CytoflowOpError('experiment',
                                       "No experiment specified")
         
        if len(self.channels) == 0:
            raise util.CytoflowOpError('channels',
                                       "Must set at least one channel")
         
        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError('name',
                                       "You have to set the gate's name "
                                       "before applying it!")
        
        if self.num_components > 1 and self.name in experiment.data.columns:
            raise util.CytoflowOpError('name',
                                       "Experiment already has a column named {0}"
                                       .format(self.name))
            
        if self.sigma > 0:
            for i in range(1, self.num_components + 1):
                cname = "{}_{}".format(self.name, i)
                if cname in experiment.data.columns:
                    raise util.CytoflowOpError('name',
                                               "Experiment already has a column named {}"
                                               .format(cname))
 
        if self.posteriors:
            for i in range(1, self.num_components + 1):
                cname = "{}_{}_posterior".format(self.name, i)
                if cname in experiment.data.columns:
                    raise util.CytoflowOpError('name',
                                               "Experiment already has a column named {}"
                                               .format(cname))               
         
        if not self._gmms:
            raise util.CytoflowOpError(None, 
                                       "No components found.  Did you forget to "
                                       "call estimate()?")
            
        for c in self.channels:
            if c not in self._scale:
                raise util.CytoflowOpError(None,
                                           "Model scale not set.  Did you forget "
                                           "to call estimate()?")
 
        for c in self.channels:
            if c not in experiment.channels:
                raise util.CytoflowOpError('channels',
                                           "Channel {0} not found in the experiment"
                                           .format(c))
        
        for b in self.by:
            if b not in experiment.conditions:
                raise util.CytoflowOpError('by',
                                           "Aggregation metadata {} not found, "
                                           "must be one of {}"
                                           .format(b, experiment.conditions))
#                             
#         if self.num_components == 1 and self.sigma == 0.0:
#             raise util.CytoflowOpError('sigma',
#                                        "if num_components is 1, sigma must be > 0.0")
        
                
        if self.num_components == 1 and self.posteriors:
            warn("If num_components == 1, all posteriors will be 1",
                 util.CytoflowOpWarning)
#             raise util.CytoflowOpError('posteriors',
#                                        "If num_components == 1, all posteriors will be 1.")
         
        if self.num_components > 1:
            event_assignments = pd.Series(["{}_None".format(self.name)] * len(experiment), dtype = "object")
 
        if self.sigma > 0:
            event_gate = {i : pd.Series([False] * len(experiment), dtype = "double")
                           for i in range(self.num_components)}
 
        if self.posteriors:
            event_posteriors = {i : pd.Series([0.0] * len(experiment), dtype = "double")
                                for i in range(self.num_components)}

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that
            # contains all the events
            groupby = experiment.data.groupby(lambda _: True)   

        # make the statistics       
        components = [x + 1 for x in range(self.num_components)]
         
        prop_idx = pd.MultiIndex.from_product([experiment[x].unique() for x in self.by] + [components], 
                                         names = list(self.by) + ["Component"])
        prop_stat = pd.Series(name = "{} : {}".format(self.name, "proportion"),
                              index = prop_idx, 
                              dtype = np.dtype(object)).sort_index()
                  
        mean_idx = pd.MultiIndex.from_product([experiment[x].unique() for x in self.by] + [components] + [self.channels], 
                                              names = list(self.by) + ["Component"] + ["Channel"])
        mean_stat = pd.Series(name = "{} : {}".format(self.name, "mean"),
                              index = mean_idx, 
                              dtype = np.dtype(object)).sort_index()
        sigma_stat = pd.Series(name = "{} : {}".format(self.name, "sigma"),
                               index = mean_idx,
                               dtype = np.dtype(object)).sort_index()
        interval_stat = pd.Series(name = "{} : {}".format(self.name, "interval"),
                                  index = mean_idx, 
                                  dtype = np.dtype(object)).sort_index()

        corr_idx = pd.MultiIndex.from_product([experiment[x].unique() for x in self.by] + [components] + [self.channels] + [self.channels], 
                                              names = list(self.by) + ["Component"] + ["Channel_1"] + ["Channel_2"])
        corr_stat = pd.Series(name = "{} : {}".format(self.name, "correlation"),
                              index = corr_idx, 
                              dtype = np.dtype(object)).sort_index()  
                 
        for group, data_subset in groupby:
            if group not in self._gmms:
                # there weren't any events in this group, so we didn't get
                # a gmm.
                continue
             
            gmm = self._gmms[group]
            x = data_subset.loc[:, self.channels[:]]
            for c in self.channels:
                x[c] = self._scale[c](x[c])
                
            # which values are missing?

            x_na = pd.Series([False] * len(x))
            for c in self.channels:
                x_na[np.isnan(x[c]).values] = True
                        
            x = x.values
            x_na = x_na.values
            group_idx = groupby.groups[group]
 
            if self.num_components > 1:
                predicted = np.full(len(x), -1, "int")
                predicted[~x_na] = gmm.predict(x[~x_na])
                
                predicted_str = pd.Series(["(none)"] * len(predicted))
                for c in range(0, self.num_components):
                    predicted_str[predicted == c] = "{0}_{1}".format(self.name, c + 1)
                predicted_str[predicted == -1] = "{0}_None".format(self.name)
                predicted_str.index = group_idx
     
                event_assignments.iloc[group_idx] = predicted_str
                
            # if we're doing sigma-based gating, for each component check
            # to see if the event is in the sigma gate.
            if self.sigma > 0.0:
                for c in range(self.num_components):
                    s = np.linalg.pinv(gmm.covariances_[c])
                    mu = gmm.means_[c]
                    
                    # compute the Mahalanobis distance

                    f = lambda x, mu, s: np.dot(np.dot((x - mu).T, s), (x - mu))
                    dist = np.apply_along_axis(f, 1, x, mu, s)

                    # come up with a threshold based on sigma.  you'll note we
                    # didn't sqrt dist: that's because for a multivariate 
                    # Gaussian, the square of the Mahalanobis distance is
                    # chi-square distributed
                    
                    p = (scipy.stats.norm.cdf(self.sigma) - 0.5) * 2
                    thresh = scipy.stats.chi2.ppf(p, 1)
                    
                    event_gate[c].iloc[group_idx] = np.less_equal(dist, thresh)
                    
            if self.posteriors:
#                 import sys;sys.path.append(r'/home/brian/.p2/pool/plugins/org.python.pydev_6.2.0.201711281614/pysrc')
#                 import pydevd;pydevd.settrace()
                
                p = gmm.predict_proba(x)
                for c in range(self.num_components):
                    event_posteriors[c].iloc[group_idx] = p[:, c]
                    
            for c in range(self.num_components):
                if len(self.by) == 0:
                    g = [c + 1]
                elif hasattr(group, '__iter__') and not isinstance(group, (str, bytes)):
                    g = tuple(list(group) + [c + 1])
                else:
                    g = tuple([group] + [c + 1])

                prop_stat.loc[g] = gmm.weights_[c]
                
                for cidx1, channel1 in enumerate(self.channels):
                    g2 = tuple(list(g) + [channel1])
                    mean_stat.loc[g2] = self._scale[channel1].inverse(gmm.means_[c, cidx1])
                    
                    s, corr = util.cov2corr(gmm.covariances_[c])
                    sigma_stat[g2] = (self._scale[channel1].inverse(s[cidx1]))
                    interval_stat.loc[g2] = (self._scale[channel1].inverse(gmm.means_[c, cidx1] - s[cidx1]),
                                             self._scale[channel1].inverse(gmm.means_[c, cidx1] + s[cidx1]))
            
                    for cidx2, channel2 in enumerate(self.channels):
                        g3 = tuple(list(g2) + [channel2])
                        corr_stat[g3] = corr[cidx1, cidx2]
                        
                    corr_stat.drop(tuple(list(g2) + [channel1]), inplace = True)

        new_experiment = experiment.clone()
          
        if self.num_components > 1:
            new_experiment.add_condition(self.name, "category", event_assignments)
            
        if self.sigma > 0:
            for c in range(self.num_components):
                gate_name = "{}_{}".format(self.name, c + 1)
                new_experiment.add_condition(gate_name, "bool", event_gate[c])              
                
        if self.posteriors:
            for c in range(self.num_components):
                post_name = "{}_{}_posterior".format(self.name, c + 1)
                new_experiment.add_condition(post_name, "double", event_posteriors[c])
                
        new_experiment.statistics[(self.name, "mean")] = pd.to_numeric(mean_stat)
        new_experiment.statistics[(self.name, "sigma")] = sigma_stat
        new_experiment.statistics[(self.name, "interval")] = interval_stat
        if len(corr_stat) > 0:
            new_experiment.statistics[(self.name, "correlation")] = pd.to_numeric(corr_stat)
        if self.num_components > 1:
            new_experiment.statistics[(self.name, "proportion")] = pd.to_numeric(prop_stat)

        new_experiment.history.append(self.clone_traits(transient = lambda _: True))
        return new_experiment
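
# Hedged usage sketch for the apply() method above, assuming it belongs to
# cytoflow's GaussianMixtureOp (the names and channel values below are
# illustrative, not taken from the original example):
#
#   import cytoflow as flow
#   op = flow.GaussianMixtureOp(name="GM", channels=["V2-A"],
#                               num_components=2, sigma=2.0)
#   op.estimate(ex)                          # fit the mixture on Experiment `ex`
#   ex2 = op.apply(ex)                       # adds the GM condition and statistics
#   ex2.statistics[(op.name, "proportion")]  # per-component event proportions
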
def load_data():
    data = pd.read_csv("cell_samples.csv")
    data = data[pd.to_numeric(data['BareNuc'], errors='coerce').notnull()]
    data['BareNuc'] = data['BareNuc'].astype('int')
    return data