Example #1
from itertools import product

import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from sklearn.model_selection import ParameterGrid
from sklearn.utils import check_array
from tqdm import tqdm

# check_random_states, fit_bet and extract_yields_stats are project-level
# helpers assumed to be importable from the surrounding package.


def apply_backtesting(bettor, param_grid, risk_factors, X, scores, odds, cv,
                      random_state, n_runs, n_jobs):
    """Apply backtesting to evaluate the bettor over a parameter grid."""
    
    # Check random states
    random_states = check_random_states(random_state, n_runs)

    # Check arrays
    X = check_array(X, dtype=None, force_all_finite=False)
    normalized_scores = []
    for score in scores:
        normalized_scores.append(check_array(score, dtype=None, ensure_2d=False))
    odds = check_array(odds, dtype=None)

    # Extract parameters
    parameters = ParameterGrid(param_grid)

    # Run backtesting
    data = Parallel(n_jobs=n_jobs)(
        delayed(fit_bet)(bettor, params, risk_factors, random_state, X,
                         normalized_scores, odds, train_indices, test_indices)
        for params, random_state, (train_indices, test_indices)
        in tqdm(list(product(parameters, random_states, cv.split(X))), desc='Tasks'))
    
    # Combine data
    data = pd.concat(data, ignore_index=True)
    data = data.groupby(['parameters', 'risk_factor', 'experiment']).apply(
        lambda df: np.concatenate(df.yields.values)).reset_index()
    data[['coverage', 'mean_yield', 'std_yield']] = pd.DataFrame(
        data[0].apply(lambda yields: extract_yields_stats(yields)).values.tolist())
    
    # Calculate results
    results = data.drop(columns=['experiment', 0]).groupby(['parameters', 'risk_factor']).mean().reset_index()
    results['std_mean_yield'] = data.groupby(['parameters', 'risk_factor'])['mean_yield'].std().values
    results = results.sort_values('mean_yield', ascending=False).reset_index(drop=True)

    return results
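The helper extract_yields_stats is called above but not shown; a minimal sketch of what it could return, assuming "coverage" means the fraction of non-zero yields (bets actually placed), is:

import numpy as np

def extract_yields_stats(yields):
    # Hypothetical helper matching the call above: returns the coverage
    # (fraction of non-zero yields) plus the mean and standard deviation
    # of the yields of a single backtesting run.
    yields = np.asarray(yields, dtype=float)
    if yields.size == 0:
        return 0.0, 0.0, 0.0
    return float((yields != 0).mean()), float(yields.mean()), float(yields.std())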
Example #2
def find_TADs(self,
              data,
              gammalist=range(10, 110, 10),
              segmentation='potts',
              minlen=3,
              drop_gamma=False,
              n_jobs='auto'):
    '''
    Find TADs in data for a list of gammas. Returns a pandas DataFrame
    with columns 'Start', 'End' and 'Gamma'. Use genome_intervals_to_chr on
    the returned object to get coordinates in bed-style format instead of
    coordinates of the concatenated genome.
    If *drop_gamma*, drops the 'Gamma' column (useful when using 1 gamma).
    '''
    raise DeprecationWarning('Will be deprecated or rewritten to use '
                             'lavaburst: github.com/nezar-compbio/lavaburst')
    if n_jobs == 'auto':
        # Empirical values on my computer; with >8 Gb memory try increasing n_jobs
        if segmentation == 'potts':
            n_jobs = 3
        elif segmentation == 'armatus':
            n_jobs = 6
    if not np.isfinite(data).all():
        print('Non-finite values in data, substituting them with zeroes')
        data[~np.isfinite(data)] = 0
    Wcomm, Wnull, pass_mask, length = _precalculate_TADs_in_array(data)
    f = _calculate_TADs
    if not n_jobs:  # None, False or 0: compute sequentially
        domains = []
        for g in gammalist:
            domains_g = f(Wcomm, Wnull, pass_mask, length, g, segmentation)
            domains.append(domains_g)
    else:
        from joblib import Parallel, delayed
        domains = Parallel(n_jobs=n_jobs, max_nbytes=1e6)(
            delayed(f)(Wcomm, Wnull, pass_mask, length, g, segmentation)
            for g in gammalist)
    domains = pd.concat(domains, ignore_index=True)
    domains = domains.query('End-Start>=' + str(minlen)).copy()
    domains = domains.sort_values(by=['Gamma', 'Start', 'End'])
    domains.reset_index(drop=True, inplace=True)
    domains[['Start', 'End']] = domains[['Start', 'End']].astype(int)
    domains[['Start', 'End']] *= self.resolution
    domains = domains[['Start', 'End', 'Score', 'Gamma']]
    if drop_gamma:
        domains.drop('Gamma', axis=1, inplace=True)
    domains = self.genome_intervals_to_chr(domains).reset_index(drop=True)
    return domains
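The core pattern here is fanning one segmentation call per gamma out over joblib workers and concatenating the per-gamma DataFrames. A self-contained toy of that pattern, with _dummy_score standing in for _calculate_TADs, is:

import pandas as pd
from joblib import Parallel, delayed

def _dummy_score(gamma):
    # Stand-in for _calculate_TADs: one small DataFrame of domains per gamma.
    return pd.DataFrame({'Start': [0, 5], 'End': [5, 9],
                         'Score': [1.0, 0.5], 'Gamma': [gamma, gamma]})

domains = pd.concat(
    Parallel(n_jobs=2)(delayed(_dummy_score)(g) for g in range(10, 40, 10)),
    ignore_index=True)
print(domains)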
Example #4
        return time_since_spray

    num_cores = multiprocessing.cpu_count()
    inputs = testtrapsweather.index

    distance_binary = Parallel(n_jobs=num_cores)(delayed(distance_calc)(i)
                                                 for i in inputs)
    time_binary = Parallel(n_jobs=num_cores)(delayed(time_calc)(i)
                                             for i in inputs)
    print('make binaries... done')

    distance_binary = pd.DataFrame(distance_binary)
    time_binary = pd.DataFrame(time_binary)

    time_binary.reset_index(inplace=True)
    time_binary.drop('index', axis=1, inplace=True)

    # if observation took place before spray, zero out time
    # else return elapsed time between spray and observation

    print('negating sprays after traps...')

    for col in time_binary.columns:
        time_binary[col] = time_binary[col].map(lambda x: 0 if x < 0 else x)
    print('done')

    # https://chrisalbon.com/python/data_wrangling/pandas_rename_multiple_columns/
    time_binary.columns = distance_binary.columns

    time_binary_backup = time_binary.copy()
    distance_binary_backup = distance_binary.copy()
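The per-column map above zeroes out negative spray-to-observation gaps; an equivalent vectorized form, shown on a toy frame with the same numeric layout as time_binary, is pandas' clip:

import pandas as pd

# Toy stand-in for time_binary: days elapsed between each spray and each
# trap observation; negative means the observation preceded the spray.
toy = pd.DataFrame({'spray_1': [-3, 0, 7], 'spray_2': [5, -1, 2]})

# Same effect as the per-column map above: negative gaps become 0.
toy = toy.clip(lower=0)
print(toy)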
Example #5
import glob

import pandas as pd
from joblib import Parallel, delayed
from nltk import tokenize
from textblob import TextBlob
from tqdm import tqdm

# convo2df (the project-specific chat parser used below) is assumed to be
# defined or imported elsewhere in the original script.

def token_text(text):
    """
    Tokenize and clean up text
    """
    # split into words using NLTK's word tokenizer
    words = tokenize.word_tokenize(text)
    # remove punctuation signs and convert to lowercase
    words = [w.lower() for w in words if w.isalpha()]
    return words

# get conversation data
convo_date = Parallel(n_jobs=12)(delayed(convo2df)(p)
                for p in tqdm(glob.glob('../data/*frompdf.txt')))
convo_date = pd.concat(convo_date, ignore_index=False)
# authorship is not relevant here
convo_date = convo_date.drop('author', axis=1)

convo_txt = Parallel(n_jobs=12)(delayed(convo2df)(p)
                for p in tqdm(glob.glob('../data/*txt')) if 'pdf' not in p)
convo_txt = pd.concat(convo_txt, ignore_index=False)

convo_df = pd.merge(convo_txt, convo_date, on='text')
# drop duplicates that originate from merging info from pdf and with old dfs
convo_df = convo_df.drop_duplicates(['text','datetime','author'])

# tokenize texts
convo_df['words'] = convo_df.text.apply(token_text)
convo_df[['polarity', 'subject']] = convo_df.words.apply(
    lambda x: pd.Series(TextBlob(' '.join(x)).sentiment))

convo_df.to_csv('../output/convo_01022020.csv', index=False)
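
A quick, illustrative check of the tokenize-then-sentiment step used above (assumes NLTK's 'punkt' tokenizer data and TextBlob's corpora are installed):

words = token_text('This chat was really great, thanks!')
print(words)  # lower-cased, punctuation-free tokens
print(TextBlob(' '.join(words)).sentiment)  # Sentiment(polarity=..., subjectivity=...)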