Example #1
def train_predict(all_df, features, prod_features, str_date, cv):
    
    # all_df: the combined data
    # features: the variables used for training
    # prod_features: the 24 financial product variables
    # str_date: the date for which the prediction output is produced.
    #           If it is 2016-05-28, the date is part of the training data and the answers are known, so this means cross-validation.
    #           If it is 2016-06-28, predictions are generated for the test data to be uploaded to Kaggle.
    # cv: whether to run cross-validation
    
    # Convert str_date, the date the prediction output is produced for, to an integer
    test_date = date_to_int(str_date)
    
    # The training data uses all data before test_date.
    train_df = all_df[all_df.int_date < test_date]
    # Split the test data off from the combined data.
    test_df = pd.DataFrame(all_df[all_df.int_date == test_date])
    
    # Extract only customers with new purchases as training data.
    X = []
    Y = []
    for i, prod in enumerate(products):
        prev = prod + '_prev1'
        # Store customers with a new purchase of this product in prX.
        prX = train_df[(train_df[prod] == 1) & (train_df[prev] == 0)]
        # prY holds the label for the new purchase.
        prY = np.zeros(prX.shape[0], dtype=np.int8) + i
        X.append(prX)
        Y.append(prY)
        
    XY = pd.concat(X)
    y = np.hstack(Y)
    # XY contains only the new-purchase data.
    XY['y'] = y
    
    # Free the variables from memory
    del train_df; del all_df
    
    # Create a new variable (ncodpers + fecha_dato) to compute a weight for each row.
    XY['ncodepers_fecha_dato'] = XY['ncodpers'].astype(str) + XY['fecha_dato']
    uniqs, counts = np.unique(XY['ncodepers_fecha_dato'], return_counts=True)
    # Via the natural exponential, rows with a higher count get a lower weight.
    weights = np.exp(1/counts - 1)
    
    # Add the weights to the XY data.
    wdf = pd.DataFrame()
    wdf['ncodepers_fecha_dato'] = uniqs
    wdf['counts'] = counts
    wdf['weights'] = weights
    XY = XY.merge(wdf, on='ncodepers_fecha_dato')
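A quick, hedged illustration of the weighting step above (toy keys, illustrative only): counting each ncodepers_fecha_dato key with np.unique and applying np.exp(1/counts - 1) gives weight 1.0 to a key that appears once and progressively smaller weights as the count grows.

# Minimal sketch of the exponential down-weighting, on made-up keys.
import numpy as np
import pandas as pd

toy = pd.DataFrame({'ncodepers_fecha_dato': ['a', 'a', 'a', 'b', 'c', 'c']})
uniqs, counts = np.unique(toy['ncodepers_fecha_dato'], return_counts=True)
weights = np.exp(1 / counts - 1)   # count 1 -> 1.00, count 2 -> ~0.61, count 3 -> ~0.51
print(pd.DataFrame({'ncodepers_fecha_dato': uniqs, 'counts': counts, 'weights': weights}))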
Example #2
 def __init__(self, leverage, fee, test_df, cols_features, hmm_model,
              long_states, random_states, short_states, **kwargs):
     super(HmmStrategy, self).__init__(
         leverage=leverage,
         fee=fee,
         test_df=test_df,
         cols_features=cols_features,
         hmm_model=hmm_model,
         long_states=long_states,
         random_states=random_states,
     )
     self.short_states = short_states
     self.ret = pd.DataFrame()
     self.signal_state_all = pd.DataFrame()
Example #3
def f主机表(a设备):
    """适用于: 普联wdr5620"""
    def fe主机():
        a设备.f切换模式(模式.C模式wdr5620.c设备管理)
        i = 0
        while True:
            va元素 = list(
                a设备.fe查找("//*[@id='eptMngList']/div[@class='eptConC']"))
            if i >= len(va元素):
                break
            v元素 = va元素[i]
            w管理 = v元素.f查找("div/div/input[1]")
            w管理.f点击()
            w详细 = a设备.f查找("//*[@id='eptMngDetail']")
            w名称 = w详细.f查找("div/p/span/pre")
            v名称 = w名称.fg文本()
            w标题 = w详细.f查找("div/span")
            v网络地址s, v物理地址s, v连接方式s = w标题.fg文本().split("|")
            v物理地址 = 地址.S物理地址.fc字符串(v物理地址s.strip()[4:])
            yield {数据表.E字段.e对端名称: v名称, 数据表.E字段.e对端物理地址: v物理地址}
            # end
            w主人网络 = a设备.f查找("//*[@id='linkedEpt_rsMenu']")
            w主人网络.f点击()
            i += 1

    return pandas.DataFrame(fe主机())
Example #4
def read_file_to_pdf(file, file_type):
    try:
        pdf = file_type_read_functions[file_type](file)
    except Exception as e:
        logger.error(f'Error reading file to pandas: {str(e)}')
        return pd.DataFrame()
    return pdf
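read_file_to_pdf dispatches through file_type_read_functions, which is not shown in the snippet. A minimal sketch of what such a lookup table could look like, assuming plain pandas readers (the keys and choice of readers here are assumptions, not the project's actual mapping):

# Hypothetical dispatch table assumed by read_file_to_pdf; the real mapping is not part of the snippet.
import pandas as pd

file_type_read_functions = {
    'csv': pd.read_csv,
    'excel': pd.read_excel,
    'json': pd.read_json,
    'parquet': pd.read_parquet,
}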
    def pca_map(self, umap):
        if self.area.isEmpty():
            self.__city_and_province()

        else:
            if (self.area.name not in SuperMap.rep_areas) or (umap.get(self.area.name)):
                if umap.get(self.area.name):
                    temp = umap.get(self.area.name)
                else:
                    temp = SuperMap.area_city_mapper.get(self.area.name)
                if self.city.isEmpty() and self.city.precision == 1:
                    if not self.area.isBelong(self.city.name) and umap.get(self.area.name) != self.city.name:
                        self.area.reset()
                self.__city_and_province()

            else:
                import logging
                SuperMap.rep_area_set.add(self.area.name)
                if self.city.isNotEmpty():
                    self.__city_and_province()

        if self.city.name.isdigit():
            self.city.reset()

        import pandas as pd

        return pd.DataFrame({'Province':[self.province.name], 'City':[self.city.name], 'Area':[self.area.name]})
Example #6
    def on_bar(self, bar):
        # so this function will get called on the streaming triggers
        # we will handle our state in here and submit trades based on that
        self._bars = self._bars.append(
            pd.DataFrame(
                {
                    'open': bar.open,
                    'high': bar.high,
                    'low': bar.low,
                    'close': bar.close,
                    'volume': bar.volume,
                },
                index=[bar.start]))

        bar_len = len(self._bars)

        self._logger.info(
            f'received bar. start = {bar.start}, close = {bar.close}, len = {bar_len}'
        )
        if bar_len < 21:
            return
        if self._outOfMarket():
            return
        if self._state == 'TO_BUY':
            if self._calc_buy_signal():
                self._submit_buy()
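Note that DataFrame.append was removed in pandas 2.0, so the bar-accumulation step above would fail on a current pandas. A small standalone sketch of the same idea using pd.concat (the bar values below are made up):

# Accumulating one-row bars with pd.concat instead of the removed DataFrame.append.
import pandas as pd

bars = pd.DataFrame()                                       # plays the role of self._bars
for start, close in [('09:30', 101.0), ('09:31', 101.5)]:   # stand-ins for streamed bars
    new_row = pd.DataFrame({'close': close}, index=[start])
    bars = pd.concat([bars, new_row])
print(bars)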
Example #7
 def plot_age_curve_params(fit, output_dir):
     df = pd.DataFrame(fit.extract(permuted=True)["beta_age_curve"],
                       columns=["Race_Num_Adj_%d" % i for i in range(3)])
     fig = sns.pairplot(df, vars=list(df.columns), 
                        diag_kind="kde", plot_kws={"alpha": 0.1})
     fn = os.path.join(output_dir, "age_curve_args.png")
     print("writing < %s >" % fn)
     fig.savefig(fn, bbox_inches="tight")
Example #8
def save_learner_multiple_run(names_list, master_dict, output_location):
    num = len(names_list)
    mean_v_df = pd.DataFrame(columns=['Iteration'])
    for i in range(num):
        run_stats_df = master_dict['run_stats_df']
        policy_df = master_dict['policy_df']
        key_metrics = pd.DataFrame(master_dict['key_metrics_dict'])
    return
    def __init__(self, data, target, classifier, filename=None):
        if data is not None:
            self.data = data
        elif filename[-3:] == 'csv':
            self.data = pd.read_csv(filename)
        elif filename[-3:] == 'txt':
            self.data = pd.read_table(filename)
        else:
            print("Data Invalid")

        self.target = target

        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
            self.data, self.target, test_size=0.7, train_size=0.3)
        self.classifier = classifier
Example #10
 def _save_excel(self, new_good_list):
     if os.path.exists(GOODS_EXCEL_PATH):
         df = pd.read_excel(GOODS_EXCEL_PATH)
         df = df.append(new_good_list)
     else:
         df = pd.DataFrame(new_good_list)
     writer = pd.ExcelWriter(GOODS_EXCEL_PATH)
     df.to_excel(
         excel_writer=writer,
         columns=['title', 'price', 'location', 'sales', 'comment_url'],
         index=False,
         encoding='utf-8',
         sheet_name='Sheet')
     writer.save()
     writer.close()
Example #11
def load_db():
    try:
        db = pd.read_pickle("DB/DB.pkl")
    except FileNotFoundError:
        db = pd.DataFrame()

    try:
        sdb = pd.read_pickle("./DB/item/items.pkl")
    except FileNotFoundError:
        sdb = pd.DataFrame()

    try:
        nlp_log = pd.read_pickle('nlp_log.pkl')
    except:
        nlp_log = pd.DataFrame()
    return db, sdb, nlp_log
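The three try/except blocks above repeat the same pattern; a hedged helper capturing it might look like this (load_or_empty is an illustrative name, not part of the project):

# Hypothetical helper: read a pickle if it exists, otherwise return an empty DataFrame.
import pandas as pd

def load_or_empty(path):
    try:
        return pd.read_pickle(path)
    except FileNotFoundError:
        return pd.DataFrame()

# Usage mirroring load_db:
# db = load_or_empty("DB/DB.pkl")
# sdb = load_or_empty("./DB/item/items.pkl")
# nlp_log = load_or_empty("nlp_log.pkl")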
Example #12
def data_extraction(file_list, main_file):
    result = pd.read_csv('./' + main_file)
    df = pd.DataFrame()     # file_list = [[name, type], []]
    for f, site_type in file_list:
        if site_type == 'yogiyo':
            df = yogiyo_data_extraction(f)
        elif site_type == 'mangoplate':
            df = mangoplate_data_extraction(f)
        elif site_type == 'tripadvisor':
            df = tripadvisor_date_extraction(f)
        elif site_type == 'diningcode':
            df = diningcode_date_extraction(f)
        elif site_type == 'menupan':
            df = menupan_data_extraction(f)
        result = pd.concat([result, df])
    
    result.to_csv('total_review.csv')
    merge_same_restaurant('total_review.csv')
Example #13
    def read_data(self, path, flag):
        samples = pd.DataFrame(columns=['基站名称', '告警数量', '曾经退服', 'label'])
        delta_time = np.timedelta64(1, 'D')

        for csvs in tqdm(os.listdir(path)):

            # For now handle each base station separately; consider combining them later.
            data = pd.read_csv(os.path.join(path, csvs))
            n_rows = len(data)
            if n_rows <= 1:
                continue

            data['告警开始时间'] = pd.to_datetime(data['告警开始时间'], format='%Y-%m-%d %H:%M:%S')

            # Draw samples in proportion to the number of alarms
            n_samples = int(n_rows * self.sample_rate)
            for n in range(n_samples):
                rand = random.randint(1, len(data) - 1)
                pre = rand - 1

                n_warnings = 0
                was_out_service = 0

                while pre >= 0 and (data['告警开始时间'][rand] - data['告警开始时间'][pre]
                                    <= delta_time):
                    if data['告警名称'][pre] in ['网元连接中断', '小区不可用告警']:
                        was_out_service = 1
                    n_warnings += 1
                    pre -= 1

                if data['告警名称'][rand] in ['网元连接中断', '小区不可用告警']:
                    label = 1
                else:
                    label = 0

                samples = samples.append(pd.DataFrame(
                    [[
                        data['基站名称'][rand], n_warnings,
                        was_out_service, label
                    ]],
                    columns=['基站名称', '告警数量', '曾经退服', 'label']),
                    ignore_index=True)

        return samples
Example #14
    def _save_events(self):
        """
        Write all events in event_db to calendar_db.csv.
        """
        out_data = []
        out_cols = ['NAME', 'DATE', 'TIME', 'DURATION', 'LOCATION', 'DESCRIPTION']
        for year in event_db.keys():
            for month in event_db[year].keys():
                for day in event_db[year + '.' + month].keys():
                    for event in event_db[year + '.' + month + '.' + day]:
                        out_data.append([event.name,
                                         event.date,
                                         event.time,
                                         event.duration,
                                         event.location,
                                         event.description])

        out_df = pd.DataFrame(data=out_data, columns=out_cols)
        out_df.to_csv('calendar_db.csv')

        return
Example #15
    def aggregate(self):
        '''
        Returns a new DataFrame where equal symbols are grouped into one record,
        and mean/median/mode/total bid/ask price is provided as well as the same
        statistics for volume (bid/ask size). IQR is reported as a tuple as well.
        Min(high), max(low), max-min/min, open-close/close, total outstanding.
        Should this basically be a .featurize() for all stocks?
        '''
        assert self.process

        aggr_df = pd.DataFrame()

        self.stocks = self.df['Symbol_Root'].unique()
        for stock in self.stocks:
            # Do aggregation
            stock_rows = self.df.loc[self.df['Symbol_Root'] == stock]

            # continue

        # Some columns may be irrelevant depending on type of file used
        return None
def random_stock_data(environ, asset_db_writer, minute_bar_writer,
                      daily_bar_writer, adjustment_writer, calendar,
                      start_session, end_session, cache, show_progress,
                      output_dir):
    # Get list of files from path
    # Slicing off the last part
    # 'example.csv'[:-4] = 'example'
    symbols = [f[:-4] for f in listdir(path)]

    if not symbols:
        raise ValueError("No symbols found in the folder")

    # Prepare an empty DataFrame for dividends
    divs = pd.DataFrame(columns=[
        'sid', 'amount', 'ex_date', 'record_date', 'declared_date', 'pay_date'
    ])

    # Prepare an empty DataFrame for splits
    splits = pd.DataFrame(columns=['sid', 'ratio', 'effective_date'])

    # Prepare an empty DataFrame for metadata
    metadata = pd.DataFrame(columns=[
        'start_date', 'end_date', 'auto_close_date', 'symbol', 'exchange'
    ])

    # Check valid trading dates, according to the selected exchange calendar
    sessions = calendar.sessions_in_range(start_session, end_session)

    # Get data for all stocks and write to Zipline
    daily_bar_writer.write(process_stocks(symbols, sessions, metadata, divs))

    # Write the metadata
    asset_db_writer.write(equities=metadata)

    # Write splits and dividends
    adjustment_writer.write(splits=splits, dividends=divs)
    """Generator function to iterate stocks, 
Example #17
def test(model, test_loader, classnum=5, cvmodeoutput=False):

    model.eval()

    test_loss = 0
    correct = 0
    total = 0

    target_num = torch.zeros((1, classnum))
    predict_num = torch.zeros((1, classnum))
    acc_num = torch.zeros((1, classnum))

    for data, target in test_loader:
        with torch.no_grad():
            data, target = Variable(data), Variable(target)

        data = data.cuda()
        target = target.cuda()

        output = model(data)

        # calculate the sum of loss for testset
        test_loss += F.nll_loss(output, target).data.item()

        # max means the prediction
        pred = output.data.max(1, keepdim=True)[1]

        correct += pred.eq(target.data.view_as(pred)).sum()

        _, predicted = torch.max(output.data, 1)

        pre_mask = torch.zeros(output.size()).scatter_(1,
                                                       predicted.view(-1,
                                                                      1), 1.)
        predict_num += pre_mask.sum(0)
        tar_mask = torch.zeros(output.size()).scatter_(1,
                                                       target.data.view(-1, 1),
                                                       1.)
        target_num += tar_mask.sum(0)
        acc_mask = pre_mask * tar_mask
        acc_num += acc_mask.sum(0)

    if not cvmodeoutput:
        recall = acc_num / target_num
        precision = acc_num / predict_num
        F1 = 2 * recall * precision / (recall + precision)
        accuracy = acc_num.sum(1) / target_num.sum(1)

        recall = (recall.cpu().numpy()[0] * 100).round(3)
        precision = (precision.cpu().numpy()[0] * 100).round(3)
        F1 = (F1.cpu().numpy()[0] * 100).round(3)
        accuracy = (accuracy.cpu().numpy()[0] * 100).round(3)

        print('recall', " ".join('%s' % id for id in recall))
        print('precision', " ".join('%s' % id for id in precision))
        print('F1', " ".join('%s' % id for id in F1))
        print('accuracy', accuracy)

        test_loss /= len(test_loader.dataset)
        # the output is like Test set: Average loss: 0.0163, Accuracy: 6698/10000 (67%)
        print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.
              format(test_loss, correct, len(test_loader.dataset),
                     100. * correct / len(test_loader.dataset)))

        # record the correct predict type
        pred = pred.cpu().numpy()
        pred = pd.DataFrame(pred)
        target = target.data.cpu().numpy()
        target = pd.DataFrame(target)
        correctpred = pred.loc[pred == target]
        print(correctpred.count())

    return 100. * correct / len(test_loader.dataset)
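The per-class bookkeeping in test() relies on scatter_ to one-hot encode predictions and targets. A tiny self-contained check on CPU tensors (toy values, two classes) shows how predict_num, target_num, and acc_num accumulate:

# Toy illustration of the scatter_-based per-class counting (CPU tensors, 2 classes).
import torch

predicted = torch.tensor([0, 1, 1, 0])   # model picks
target = torch.tensor([0, 0, 1, 1])      # ground truth
pre_mask = torch.zeros(4, 2).scatter_(1, predicted.view(-1, 1), 1.)
tar_mask = torch.zeros(4, 2).scatter_(1, target.view(-1, 1), 1.)
predict_num = pre_mask.sum(0)            # predictions per class   -> [2., 2.]
target_num = tar_mask.sum(0)             # occurrences per class   -> [2., 2.]
acc_num = (pre_mask * tar_mask).sum(0)   # correct hits per class  -> [1., 1.]
print(acc_num / predict_num, acc_num / target_num)  # per-class precision and recall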
import pandas as pd
a = range(10)
b = [item * 2 for item in a]
data = pd.DataFrame({"idx": a, "value": b})
data.to_csv("result.csv", sep=",")
Example #19
df = pd.read_csv("../raw/quaterfinal_gy_cmp_training_traveltime.txt",
                 delimiter=";",
                 dtype={"link_ID": object})

# Exploratory data analysis (EDA)

# Feature transformation
df["travel_time"] = np.log1p(df["travel_time"])


# Data smoothing
def quantile_clip(group):
    # group.plot()
    group[group < group.quantile(0.05)] = group.quantile(0.05)
    group[group > group.quantile(0.95)] = group.quantile(0.95)
    # group.plot()
    plt.show()
    return group


df["travel_time"] = df.groupby(["link_ID", "date"
                                ])["travel_time"].transform(quantile_clip)

# Missing value imputation
date_range = pd.date_range("2016-07-01 00:00:00", "2016-07-31 00:00:00", freq = "2min") \
 .append(pd.date_range("2017-04-01 00:00:00", "2017-07-31 00:00:00", freq = "2min"))
new_index = pd.MultiIndex.from_product(
    [link_df["link_ID"].unique(), date_range],
    names=["link_ID", "time_interval_begin"])
df1 = pd.DataFrame(index=new_index).reset_index()
df3 = pd.merge(df1, df, on=["link_ID", "time_interval_begin"], how="left")
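The imputation step builds the full (link_ID, time_interval_begin) grid and left-merges the observations onto it, so intervals without a record surface as NaN. A minimal toy sketch of that idea (made-up data, not the contest files):

# Grid + left-merge gap filling on made-up data.
import pandas as pd

obs = pd.DataFrame({
    'link_ID': ['A', 'A', 'B'],
    'time_interval_begin': pd.to_datetime(['2016-07-01 00:00', '2016-07-01 00:04', '2016-07-01 00:00']),
    'travel_time': [1.2, 1.5, 2.0],
})
grid_index = pd.MultiIndex.from_product(
    [obs['link_ID'].unique(),
     pd.date_range('2016-07-01 00:00', '2016-07-01 00:04', freq='2min')],
    names=['link_ID', 'time_interval_begin'])
grid = pd.DataFrame(index=grid_index).reset_index()
filled = pd.merge(grid, obs, on=['link_ID', 'time_interval_begin'], how='left')
print(filled)   # rows with no observation carry NaN in travel_time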
Example #20
import pandas as pd

hdt = pd.read_csv('DeloitteWeekly_v3_LJS_04182021.csv')
yolo_hdt = hdt[hdt['County']=='Yolo']
yolo_hdt
pd.PeriodIndex(yolo_hdt['ResultDate'], freq='W')
yolo_hdt['week'] = pd.PeriodIndex(yolo_hdt['ResultDate'], freq='W')
yolo_hdt.query('Result=="Detected"').groupby('week').sum()
yolo_hdt.query('Result=="Detected"').groupby('week').count()
yolo_hdt.query('Result=="Detected"').groupby('week').count()['Row']
pos = yolo_hdt.query('Result=="Detected"').groupby('week').count()['Row']
neg = yolo_hdt.query('Result=="Not Detected"').groupby('week').count()['Row']
pos
neg
pd.DataFrame({'pos result':pos, 'neg result':neg})
pd.DataFrame({'pos result':pos, 'neg result':neg})['2020-12-1':'2021-2-1']
pd.DataFrame({'pos result':pos, 'neg result':neg})['2020-12-1':'2021-2-1'].to_csv('dec_jan_hdt.csv')
cdph = pd.read_excel('CDPH_testing_data_4_24.xlsx')
cdph
cdph['lab_result_date']
yolo = cdph[cdph['county'].str.lower() == 'yolo']
yolo
yolo.query('lab_result_date == "2020-12-15"')
yolo['week'] = pd.PeriodIndex(yolo['lab_result_date'], freq='W')
yolo.groupby('week').sum()
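The exploratory session above can be condensed into a short script; a hedged version assuming the same file and column names:

# Condensed version of the session above (file and column names taken from the transcript).
import pandas as pd

hdt = pd.read_csv('DeloitteWeekly_v3_LJS_04182021.csv')
yolo_hdt = hdt[hdt['County'] == 'Yolo'].copy()
yolo_hdt['week'] = pd.PeriodIndex(yolo_hdt['ResultDate'], freq='W')

pos = yolo_hdt.query('Result=="Detected"').groupby('week').count()['Row']
neg = yolo_hdt.query('Result=="Not Detected"').groupby('week').count()['Row']
weekly = pd.DataFrame({'pos result': pos, 'neg result': neg})
weekly['2020-12-1':'2021-2-1'].to_csv('dec_jan_hdt.csv')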
Example #21
def targetstock(means, data, targetstock, purdic):  # daily target stocks
    if means == '1':
        for i in range(len(data)):
            if (data.iloc[i, [4]][0] > 0) & (data.iloc[i, [6]][0] >
                                             0) & (data.iloc[i, [7]][0] > 0):
                targetstock = targetstock.append(df.iloc[i])

    if means == '2':
        for i in range(len(data)):
            if (data.iloc[i, [4]][0] < 0) & (data.iloc[i, [6]][0] >
                                             0) & (data.iloc[i, [7]][0] > 0):
                targetstock = targetstock.append(df.iloc[i])

    if means == '3':
        for i in range(len(data)):
            if (data.iloc[i, [4]][0] > 0) & (data.iloc[i, [6]][0] <
                                             0) & (data.iloc[i, [7]][0] > 0):
                targetstock = targetstock.append(df.iloc[i])

    if means == '4':
        for i in range(len(data)):
            if (data.iloc[i, [4]][0] > 0) & (data.iloc[i, [6]][0] >
                                             0) & (data.iloc[i, [7]][0] < 0):
                targetstock = targetstock.append(df.iloc[i])

    if means == '5':
        for i in range(len(data)):
            if (data.iloc[i, [4]][0] > 0) & (data.iloc[i, [6]][0] <
                                             0) & (data.iloc[i, [7]][0] < 0):
                targetstock = targetstock.append(df.iloc[i])

    if means == '6':
        for i in range(len(data)):
            if (data.iloc[i, [4]][0] < 0) & (data.iloc[i, [6]][0] >
                                             0) & (data.iloc[i, [7]][0] < 0):
                targetstock = targetstock.append(df.iloc[i])

    if means == '7':
        for i in range(len(data)):
            if (data.iloc[i, [4]][0] < 0) & (data.iloc[i, [6]][0] <
                                             0) & (data.iloc[i, [7]][0] > 0):
                targetstock = targetstock.append(df.iloc[i])

    if means == '8':
        for i in range(len(data)):
            if (data.iloc[i, [4]][0] < 0) & (data.iloc[i, [6]][0] <
                                             0) & (data.iloc[i, [7]][0] < 0):
                targetstock = targetstock.append(df.iloc[i])

    if means == '9':  #5+6
        tem = pd.DataFrame()
        for i in range(len(data)):
            if (data.iloc[i, [4]][0] > 0) & (data.iloc[i, [6]][0] < 0) & (
                    data.iloc[i, [7]][0] < 0):  #5
                tem = tem.append(df.iloc[i])
            if (data.iloc[i, [4]][0] < 0) & (data.iloc[i, [6]][0] > 0) & (
                    data.iloc[i, [7]][0] < 0):  #6
                targetstock = targetstock.append(df.iloc[i])
        tem = tem.sort_values(by=['dif'], ascending=False)

        for i in range(min(len(tem), 10)):
            targetstock = targetstock.append(tem.iloc[i])

    if means == '0':  #4+6
        tem = pd.DataFrame()
        for i in range(len(data)):
            if (data.iloc[i, [4]][0] > 0) & (data.iloc[i, [6]][0] > 0) & (
                    data.iloc[i, [7]][0] < 0):  #4
                tem = tem.append(df.iloc[i])
            if (data.iloc[i, [4]][0] < 0) & (data.iloc[i, [6]][0] > 0) & (
                    data.iloc[i, [7]][0] < 0):  #6
                targetstock = targetstock.append(df.iloc[i])
        '''
        if len(targetstock) < 8:
            num = 8 - len(targetstock)
            if len(tem)!=0 :
                tem.sort_values(by = 'dif',ascending = True)
                for i in range(min(len(tem),num)):
                    targetstock = targetstock.append(tem.iloc[i])
        '''

        for i in range(len(tem)):
            targetstock = targetstock.append(tem.iloc[i])

    return targetstock
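Each branch above tests the signs of columns 4, 6 and 7 row by row (and appends rows from df while testing data; the sketch below assumes both names refer to the same frame). A hedged, vectorized equivalent of the means == '1' branch selects the same rows with a boolean mask:

# Vectorized sketch of the means == '1' branch: columns 4, 6 and 7 all positive.
import pandas as pd

def select_positive(data):
    mask = (data.iloc[:, 4] > 0) & (data.iloc[:, 6] > 0) & (data.iloc[:, 7] > 0)
    return data[mask]

# targetstock = pd.concat([targetstock, select_positive(data)])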
Example #22
    def train(self):

        #====================================== Training ===========================================#
        #===========================================================================================#

        unet_path = os.path.join(
            self.model_path, '%s-%d-%.4f-%d-%.4f.pkl' %
            (self.model_type, self.num_epochs, self.lr, self.num_epochs_decay,
             self.augmentation_prob))
        train_history_path = os.path.join(
            self.model_path, 'train-%s-%d-%.4f-%d-%.4f.csv' %
            (self.model_type, self.num_epochs, self.lr, self.num_epochs_decay,
             self.augmentation_prob))
        valid_history_path = os.path.join(
            self.model_path, 'valid-%s-%d-%.4f-%d-%.4f.csv' %
            (self.model_type, self.num_epochs, self.lr, self.num_epochs_decay,
             self.augmentation_prob))
        if os.path.isfile(unet_path):
            self.unet.load_state_dict(torch.load(unet_path))
            print('%s is Successfully Loaded from %s' %
                  (self.model_type, unet_path))
        else:
            lr = self.lr
            best_unet_score = 0.
            train_history = []
            valid_history = []
            for epoch in range(self.num_epochs):

                self.unet.train(True)
                epoch_loss = 0

                acc = 0.  # Accuracy
                SE = 0.  # Sensitivity (Recall)
                SP = 0.  # Specificity
                PC = 0.  # Precision
                F1 = 0.  # F1 Score
                JS = 0.  # Jaccard Similarity
                DC = 0.  # Dice Coefficient
                length = 0

                for i, (images, GT) in enumerate(self.train_loader):
                    images = images.to(self.device)
                    GT = GT.to(self.device)

                    SR = self.unet(images)
                    SR_probs = F.sigmoid(SR)
                    SR_flat = SR_probs.view(SR_probs.size(0), -1)
                    GT_flat = GT.view(GT.size(0), -1)
                    loss = self.criterion(SR_flat, GT_flat)
                    epoch_loss += loss.item()

                    self.reset_grad()
                    loss.backward()
                    self.optimizer.step()

                    acc += get_accuracy(SR, GT)
                    SE += get_sensitivity(SR, GT)
                    SP += get_specificity(SR, GT)
                    PC += get_precision(SR, GT)
                    F1 += get_F1(SR, GT)
                    JS += get_JS(SR, GT)
                    DC += get_DC(SR, GT)
                    length += images.size(0)

                acc = acc / length
                SE = SE / length
                SP = SP / length
                PC = PC / length
                F1 = F1 / length
                JS = JS / length
                DC = DC / length


                print('Epoch [%d/%d], Loss: %.4f, \n[Training] Acc: %.4f, SE: %.4f, SP: %.4f, PC: %.4f, F1: %.4f, JS: %.4f, DC: %.4f' % (
                   epoch+1, self.num_epochs, \
                   epoch_loss,\
                   acc, SE, SP, PC, F1, JS, DC))
                train_history.append([acc, SE, SP, PC, F1, JS, DC])

                if (epoch + 1) > (self.num_epochs - self.num_epochs_decay):
                    lr -= (self.lr / float(self.num_epochs_decay))
                    for param_group in self.optimizer.param_groups:
                        param_group['lr'] = lr
                    print('Decay learning rate to lr: {}.'.format(lr))

                #===================================== Validation ====================================#
                self.unet.train(False)
                self.unet.eval()

                acc = 0.  # Accuracy
                SE = 0.  # Sensitivity (Recall)
                SP = 0.  # Specificity
                PC = 0.  # Precision
                F1 = 0.  # F1 Score
                JS = 0.  # Jaccard Similarity
                DC = 0.  # Dice Coefficient
                length = 0
                for i, (images, GT) in enumerate(self.valid_loader):

                    images = images.to(self.device)
                    GT = GT.to(self.device)
                    SR = F.sigmoid(self.unet(images))
                    acc += get_accuracy(SR, GT)
                    SE += get_sensitivity(SR, GT)
                    SP += get_specificity(SR, GT)
                    PC += get_precision(SR, GT)
                    F1 += get_F1(SR, GT)
                    JS += get_JS(SR, GT)
                    DC += get_DC(SR, GT)

                    length += images.size(0)

                acc = acc / length
                SE = SE / length
                SP = SP / length
                PC = PC / length
                F1 = F1 / length
                JS = JS / length
                DC = DC / length
                unet_score = JS + DC

                print(
                    '[Validation] Acc: %.4f, SE: %.4f, SP: %.4f, PC: %.4f, F1: %.4f, JS: %.4f, DC: %.4f'
                    % (acc, SE, SP, PC, F1, JS, DC))
                valid_history.append([acc, SE, SP, PC, F1, JS, DC])
                '''
				torchvision.utils.save_image(images.data.cpu(),
											os.path.join(self.result_path,
														'%s_valid_%d_image.png'%(self.model_type,epoch+1)))
				torchvision.utils.save_image(SR.data.cpu(),
											os.path.join(self.result_path,
														'%s_valid_%d_SR.png'%(self.model_type,epoch+1)))
				torchvision.utils.save_image(GT.data.cpu(),
											os.path.join(self.result_path,
														'%s_valid_%d_GT.png'%(self.model_type,epoch+1)))
				'''

                if unet_score > best_unet_score:
                    best_unet_score = unet_score
                    best_epoch = epoch
                    best_unet = self.unet.state_dict()
                    print('Best %s model score : %.4f' %
                          (self.model_type, best_unet_score))
                    torch.save(best_unet, unet_path)
            train_history = pd.DataFrame(
                train_history,
                columns=['acc', 'SE', 'SP', 'PC', 'F1', 'JS', 'DC'])
            valid_history = pd.DataFrame(
                valid_history,
                columns=['acc', 'SE', 'SP', 'PC', 'F1', 'JS', 'DC'])
            train_history.to_csv(train_history_path)
            valid_history.to_csv(valid_history_path)

            #===================================== Test ====================================#
            del self.unet
            del best_unet
            self.build_model()
            self.unet.load_state_dict(torch.load(unet_path))

            self.unet.train(False)
            self.unet.eval()

            acc = 0.  # Accuracy
            SE = 0.  # Sensitivity (Recall)
            SP = 0.  # Specificity
            PC = 0.  # Precision
            F1 = 0.  # F1 Score
            JS = 0.  # Jaccard Similarity
            DC = 0.  # Dice Coefficient
            length = 0
            for i, (images, GT) in enumerate(self.valid_loader):

                images = images.to(self.device)
                GT = GT.to(self.device)
                SR = F.sigmoid(self.unet(images))
                acc += get_accuracy(SR, GT)
                SE += get_sensitivity(SR, GT)
                SP += get_specificity(SR, GT)
                PC += get_precision(SR, GT)
                F1 += get_F1(SR, GT)
                JS += get_JS(SR, GT)
                DC += get_DC(SR, GT)

                length += images.size(0)

            acc = acc / length
            SE = SE / length
            SP = SP / length
            PC = PC / length
            F1 = F1 / length
            JS = JS / length
            DC = DC / length
            unet_score = JS + DC

            f = open(os.path.join(self.result_path, 'result.csv'),
                     'a',
                     encoding='utf-8',
                     newline='')
            wr = csv.writer(f)
            wr.writerow([
                self.model_type, acc, SE, SP, PC, F1, JS, DC, self.lr,
                best_epoch, self.num_epochs, self.num_epochs_decay,
                self.augmentation_prob
            ])
            f.close()
Example #23
 def __init__(self):
     df = pandas.DataFrame({'id': [1, 2], 'name': ['jack', 'nancy']})
     df.to_excel('report.xlsx')
Example #24
_sorted.iloc[0]  # {'a': 3, 'b': 5}  (         top item)
_sorted.index[0] # 2                 (index of top item)

# filter by content
df = pd.DataFrame({'foo':[1,2,3,4,5,6], 'bar':[9,8,7,6,5,4]})
grouped = df.groupby('foo')
grouped.filter(lambda g: (g['foo'] > 3).all())

# filter by labels (not on contents)
df = pd.DataFrame([[1,2,3], [4,5,6]], index=['mouse','rabbit'], columns=['one','two','three'])
df.filter(items=['one', 'three'])
df.filter(regex='e$', axis=1)
df.filter(like='bbi', axis=0)

# drop
df = pd.DataFrame([ [1,2], [3,4], [5,6], [7,8] ])
df = df.drop([2,3])
df # [ [1,2], [3,4] ]

df = pd.DataFrame({'a': [1,2,3,4], 'b':[5,6,7,8], 'c':[9,10,11,12]})
df = df.drop(columns='c')
df # { 'a': [1,2,3,4], 'b':[5,6,7,8] }

# to csv
pd.DataFrame([ [1,2], [3,4], [5,6] ]).to_csv(index=False, header=None) # '1,2\r\n3,4\r\n5,6\r\n'
pd.DataFrame([ [1,2], [3,4], [5,6] ]).to_csv('myfile.csv', index=False, header=None)
pd.DataFrame([ [1,2], [3,4], [5,6] ]).to_csv('myfile.txt', sep='\t', index=False)

# count of csv rows
df = pd.DataFrame([ [1,2], [3,4], [5,6] ])
df.shape[0]   # 3
Example #25
ulli_total = []
company_temp = []
company = []
num = []
data = []
for c in range(0, len(li_code) - 1):
    ulli_total.append(li_code[c].get_text())
# print(ulli_total)
for a in range(0, len(ulli_total) - 1):
    company_temp.append(ulli_total[a].split('), '))
    company.append(company_temp[a][0] + ')')
    num.append(company_temp[a][0])
    data.append([company, num])
print(data)

df = pd.DataFrame(data, columns=["company", "양"])
df.to_csv('crawling.csv', encoding="utf-8")

# continent =[]
# country=[]

# print(headline_array)
# num_headline=0
# while(1):
#     if headline_array[num_headline].get_text()=='Africa':
#         num_country=0
#         num_country=num_headline+1
#         while(1):
#             if headline_array[num_country].get_text()=='Algeria':
#                 continent.append(headline_array[num_headline].get_text())
#                 country.append(headline_array[num_country].get_text())
Example #26
	("enc", OrdinalEncoder(handle_unknown = 'ignore'))
])

categorical_pipeline = Pipeline([
	("imp", SimpleImputer(strategy= "most_frequent")),
	("enc", OneHotEncoder(sparse=True, handle_unknown = 'ignore'))
])


pre_pipe = ColumnTransformer([
	("cat_pre", categorical_pipeline, categorical_features),
	("ord_pre", ordinal_pipeline, ordinal_features),
	("num_pre", numerical_pipeline, numerical_features)
])

model = sklearn.

full_pipe = Pipeline([
	("pre", pre_pipe),
	("model", model)
])
full_pipe.fit(train_x, train_y)
score = full_pipe.score(test_x, test_y)

y_hat = full_pipe.predict(test_x)

submission = pd.DataFrame()
submission.loc[:, "Id"] = df_test.loc[:, "Id"]
submission.loc[:, "SalePrice"] = y_hat
submission.to_csv("submission.csv", index=False)
# In[24]:

import numpy as np
a1 = pd.Series([1, 2, 3, 4, 5, (np.nan)])
print(a1)

# In[14]:

dates = pd.date_range("20190601", periods=5)
print(dates)

# ##

# In[15]:

a2 = pd.DataFrame(np.random.randn(5, 4), index=dates, columns=list("ABCD"))

# In[23]:

a2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20190601'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': "foo"
})
print(a2)

# In[27]:
Example #28
#_*_ coding:utf-8 _*_
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# The data has four comma-separated fields: creation time, course name, number of learners, and study time; read it with pandas into a DataFrame
courses=pd.read_table('courses.txt',sep=',',header=0)

# Use to_datetime to extract the creation time and use it as the index of a new table built from the old data; the creation-time column is then redundant, so drop it

i=pd.to_datetime(courses['创建时间'])
courses_ts=pd.DataFrame(data=courses.values,columns=courses.columns,index=i)
courses_ts=courses_ts.drop('创建时间',axis=1)

# Downsample the data to weekly frequency and sum
courses_ts_W=courses_ts.resample('W').sum()

# A function that plots with matplotlib
def mat_figure():
    plt.plot_date(courses_ts_W.index,courses_ts_W['学习时间'],'-')
    plt.xlabel('Time Series')
    plt.ylabel('Study Time')
    plt.show()
# The matplotlib plot is cluttered and does not show the trend clearly

# Plot with seaborn
def sea_figure():
    # Add an ordinal column to make scatter plotting easier
    courses_ts_W['id']=range(0,len(courses_ts_W.index.values))
    # Use seaborn's regplot: pass the x/y parameters, specify the data source, set the scatter options, and disable the confidence interval
    sns.regplot('id','学习时间',data=courses_ts_W,scatter_kws={'s':10},order=5,ci=None,truncate=True)
print('# ...GENERATING TIMESERIES ON THE DATASET... #')

shops_li = df.Shop_id.unique()
items_li = df.Item_id.unique()

#cols = ['Item_Category_Name', 'Item_Price', 'Item_Cnt_Day']

with open('data/unique_months.json') as json_file:
    unique_months_dict = json.load(json_file)

raw_data_final = []
with tqdm(total=12) as pbar_files:
    for element in range(1, 13):
        train_final = []
        with tqdm(total=len(shops_li)) as pbar_shops:
            for shop in shops_li:
                with tqdm(total=len(items_li)) as pbar_items:
                    for item in items_li:
                        key = '{};{}'.format(item, shop)
                        if key in unique_months_dict:
                            train_final = getTimeSeriesDataSet(
                                df, shop, item, element, train_final)
                        pbar_items.update(1)
                pbar_shops.update(1)
        pbar_files.update(1)
        print('# ...WRITING FILE OF SERIES {}... # '.format(element))
        df_final = pd.DataFrame(train_final)
        df_final.to_csv('train_final_{}_series.csv'.format(element),
                        sep=';',
                        index=False)
def write_out_genome_coverage(ncbi_genomes_totals, genomic_accession_dict,
                              time_stamp, args):
    """Write out the genome coverage of NCBI GenBank database by the local CAZyme database
    
    :param ncbi_genomes_totals: dict {kingdom: number of NCBI GenBank genomes}
    :param genomic_accession_dict: dict
        {kingdom: {genus: {species: {accession: {proteins: set(), counts: int}}}}
    :param time_stamp: str, date and time script was invoked
    :param args: cmd-line args parser
    
    Return nothing
    """
    column_names = [
        'Kingdom', 'NCBI_genomes', 'CAZy_genomes', 'Coverage_percent'
    ]
    coverage_df = pd.DataFrame(columns=column_names)
    graph_columns = ['Kingdom', 'NCBI', 'CAZy']
    graph_df = pd.DataFrame(columns=graph_columns)

    for kingdom in KINGDOMS:
        ncbi = ncbi_genomes_totals[kingdom]

        cazy = 0
        genera = genomic_accession_dict[kingdom]
        for genus in genera:
            organisms = genera[genus]
            for species in organisms:
                species_genome_accessions = len(list(
                    organisms[species].keys()))
                cazy += species_genome_accessions

        coverage = (cazy / ncbi) * 100

        row_data = [kingdom, ncbi, cazy, coverage]
        new_row = pd.DataFrame([row_data], columns=column_names)
        coverage_df = coverage_df.append(new_row)

        row_data = [kingdom, int(ncbi), int(cazy)]
        new_row = pd.DataFrame([row_data], columns=graph_columns)
        graph_df = graph_df.append(new_row)

    output_path = args.output_dir / f"cazy_genbank_genome_coverage_{time_stamp}.csv"
    coverage_df.to_csv(output_path)

    fig, ax = plt.subplots()
    # plot CAZy bars
    ax.bar(
        graph_df['Kingdom'],
        graph_df['CAZy'],
        label='CAZy',
        color='orange',
    )
    # add NCBI bars (the higher bars)
    ax.bar(
        graph_df['Kingdom'],
        graph_df['NCBI'],
        bottom=graph_df['CAZy'],
        label='NCBI',
        color='dodgerblue',
    )
    ax.set_xlabel('Kingdom')
    ax.set_ylabel('Number of genomes in the database')
    ax.set_title('GenBank genomes included in CAZy')

    ax.legend()

    output_path = args.output_dir / f"gbk_cazy_genomes_plot_{time_stamp}.png"
    fig.savefig(output_path, bbox_inches='tight', dpi=360)

    return