def main(args):
    """Compute per-word p-value records for a time series and write them to CSV.

    Reads the time-series data frame named by ``args.filename``, optionally
    drops one column, runs ``get_pval_word_chunk`` over all words in parallel
    and writes two CSV files: the per-time-point second record of every result
    (``args.sample_file``) and the summarised p-values sorted by ``tpval``
    (``args.pval_file``).

    Args:
        args: Parsed command-line namespace. Attributes read here are
            ``filename``, ``pval_file``, ``sample_file``, ``col``,
            ``threshold``, ``workers`` and ``B``.
    """
    # Read the arguments
    df_f = args.filename
    pval_file = args.pval_file
    sample_file = args.sample_file
    col_to_drop = args.col
    threshold = float(args.threshold)
    # Coerce explicitly so a string-valued option still yields an integer
    # job count and chunk size.
    workers = int(args.workers)

    # Single-argument print calls produce the same output on Python 2 and 3.
    print("Config:")
    print("Input data frame file name: %s" % (df_f,))
    print("Output pvalue file %s" % (pval_file,))
    print("Output sample file %s" % (sample_file,))
    print("Columns to drop %s" % (col_to_drop,))
    print("Threshold %s" % (threshold,))

    # Read the time series data
    norm_df = pd.read_csv(df_f)

    # Drop the column if needed. We typically drop the 1st column as it
    # always is 0 by default.
    if col_to_drop in norm_df.columns:
        cols = norm_df.columns.tolist()
        if col_to_drop == norm_df.columns[-1]:
            # The column to drop is the last one: reverse the order of the
            # time-point columns (everything after the first two columns)
            # before dropping it.
            time_points = cols[2:]
            new_cols = cols[0:2] + time_points[::-1]
            norm_df = norm_df[new_cols]
        norm_df = norm_df.drop(col_to_drop, axis=1)
        print("Dropped column %s" % (col_to_drop,))

    print("Columns of the data frame are %s" % (norm_df.columns,))
    cwords = norm_df.word.values
    print("Number of words we are analyzing: %s" % (len(cwords),))

    # np.ceil returns a float; the chunk size is a count, so make it an int.
    chunksz = int(np.ceil(len(cwords) / float(workers)))
    results = parallelize_func(cwords[:], get_pval_word_chunk,
                               chunksz=chunksz, n_jobs=workers,
                               df=norm_df, B=args.B)

    # Each result is a pair of records; split them into two parallel tuples.
    pvals, num_samples = zip(*results)
    ts_columns = list(norm_df.columns[TS_OFFSET:len(pvals[0]) + 1])
    header = ['word'] + ts_columns
    pvalue_df = pd.DataFrame.from_records(list(pvals), columns=header)

    # Append additional columns to the final df
    pvalue_df_final = pvalue_df.copy(deep=True)
    pvalue_df_final['min_pval'], pvalue_df_final['cp'] = zip(
        *pvalue_df.apply(get_minpval_cp, axis=1))
    pvalue_df_final['tpval'], pvalue_df_final['tcp'] = zip(
        *pvalue_df.apply(get_cp_pval, axis=1, zscore_df=norm_df,
                         threshold=threshold))
    # Only the summary columns are kept; the per-time-point values go away.
    pvalue_df_final.drop(ts_columns, axis=1, inplace=True)

    # Write the sample output.
    num_samples_df = pd.DataFrame.from_records(list(num_samples),
                                               columns=header)
    num_samples_df.to_csv(sample_file, encoding='utf-8')

    # Write the pvalue output, sorted by 'tpval' in ascending order.
    sdf = pvalue_df_final.sort_values(by=['tpval'])
    sdf.to_csv(pval_file, encoding='utf-8')
def main(args):
    """Compute per-word p-value records for a time series and write them to CSV.

    Reads the time-series data frame named by ``args.filename``, optionally
    drops one column, runs ``get_pval_word_chunk`` over all words in parallel
    and writes two CSV files: the per-time-point second record of every result
    (``args.sample_file``) and the summarised p-values sorted by ``tpval``
    (``args.pval_file``).

    Args:
        args: Parsed command-line namespace. Attributes read here are
            ``filename``, ``pval_file``, ``sample_file``, ``col``,
            ``threshold``, ``workers`` and ``B``.
    """
    # Read the arguments
    df_f = args.filename
    pval_file = args.pval_file
    sample_file = args.sample_file
    col_to_drop = args.col
    threshold = float(args.threshold)
    # Coerce explicitly so a string-valued option still yields an integer
    # job count and chunk size.
    workers = int(args.workers)

    # Single-argument print calls produce the same output on Python 2 and 3.
    print("Config:")
    print("Input data frame file name: %s" % (df_f,))
    print("Output pvalue file %s" % (pval_file,))
    print("Output sample file %s" % (sample_file,))
    print("Columns to drop %s" % (col_to_drop,))
    print("Threshold %s" % (threshold,))

    # Read the time series data
    norm_df = pd.read_csv(df_f)

    # Drop the column if needed. We typically drop the 1st column as it
    # always is 0 by default.
    if col_to_drop in norm_df.columns:
        cols = norm_df.columns.tolist()
        if col_to_drop == norm_df.columns[-1]:
            # The column to drop is the last one: reverse the order of the
            # time-point columns (everything after the first two columns)
            # before dropping it.
            time_points = cols[2:]
            new_cols = cols[0:2] + time_points[::-1]
            norm_df = norm_df[new_cols]
        norm_df = norm_df.drop(col_to_drop, axis=1)
        print("Dropped column %s" % (col_to_drop,))

    print("Columns of the data frame are %s" % (norm_df.columns,))
    cwords = norm_df.word.values
    print("Number of words we are analyzing: %s" % (len(cwords),))

    # np.ceil returns a float; the chunk size is a count, so make it an int.
    chunksz = int(np.ceil(len(cwords) / float(workers)))
    results = parallelize_func(cwords[:], get_pval_word_chunk,
                               chunksz=chunksz, n_jobs=workers,
                               df=norm_df, B=args.B)

    # Each result is a pair of records; split them into two parallel tuples.
    pvals, num_samples = zip(*results)
    ts_columns = list(norm_df.columns[TS_OFFSET:len(pvals[0]) + 1])
    header = ['word'] + ts_columns
    pvalue_df = pd.DataFrame.from_records(list(pvals), columns=header)

    # Append additional columns to the final df
    pvalue_df_final = pvalue_df.copy(deep=True)
    pvalue_df_final['min_pval'], pvalue_df_final['cp'] = zip(
        *pvalue_df.apply(get_minpval_cp, axis=1))
    pvalue_df_final['tpval'], pvalue_df_final['tcp'] = zip(
        *pvalue_df.apply(get_cp_pval, axis=1, zscore_df=norm_df,
                         threshold=threshold))
    # Only the summary columns are kept; the per-time-point values go away.
    pvalue_df_final.drop(ts_columns, axis=1, inplace=True)

    # Write the sample output.
    num_samples_df = pd.DataFrame.from_records(list(num_samples),
                                               columns=header)
    num_samples_df.to_csv(sample_file, encoding='utf-8')

    # Write the pvalue output, sorted by 'tpval' in ascending order.
    # DataFrame.sort(columns=...) was removed from pandas; sort_values is
    # the supported equivalent.
    sdf = pvalue_df_final.sort_values(by=['tpval'])
    sdf.to_csv(pval_file, encoding='utf-8')
def main(args):
    """Detect a change point for every word's time series and write a CSV.

    Reads the data frame named by ``args.filename``, restricts it with
    ``get_filtered_df`` and the vocabulary file, optionally normalizes it and
    drops one column, then processes all words in parallel. When
    ``args.dump_pval`` is set the output has columns ``word, cp, pval`` sorted
    by ``pval``; otherwise it has ``word, cp`` sorted by ``cp``.

    Args:
        args: Parsed command-line namespace. Attributes read here are
            ``filename``, ``vocab_file``, ``pval_file``, ``col``,
            ``dont_normalize``, ``workers``, ``dump_pval`` and ``threshold``.
    """
    # Read the arguments
    df_f = args.filename
    common_vocab_file = args.vocab_file
    pval_file = args.pval_file
    col_to_drop = args.col
    should_normalize = not args.dont_normalize
    n_jobs = int(args.workers)
    cp_pval = args.dump_pval
    threshold = float(args.threshold) if args.threshold is not None else None

    # Single-argument print calls produce the same output on Python 2 and 3.
    print("CONFIG:")
    print("FILENAME: %s" % (df_f,))
    print("VOCAB FILE: %s" % (common_vocab_file,))
    print("PVAL_FILE: %s" % (pval_file,))
    print("COL TO DROP: %s" % (col_to_drop,))
    print("NORMALIZE: %s" % (should_normalize,))
    print("Threshold %s" % (threshold,))

    # Read the time series data
    df = pd.read_csv(df_f)

    # Restrict only to the common vocabulary.
    df = get_filtered_df(df, common_vocab_file)

    # Normalize the data frame
    norm_df = normalize_timeseries(df) if should_normalize else df

    # Drop a column if needed.
    if col_to_drop in norm_df.columns:
        # Take the column list from the frame that is actually tested and
        # reordered, so the check and the reorder agree.
        cols = norm_df.columns.tolist()
        if col_to_drop == norm_df.columns[-1]:
            # The column to drop is the last one: reverse the order of the
            # time-point columns (everything after the first two columns)
            # before dropping it.
            time_points = cols[2:]
            new_cols = cols[0:2] + time_points[::-1]
            norm_df = norm_df[new_cols]
            print(norm_df.columns)
        norm_df = norm_df.drop(col_to_drop, axis=1)

    print("Columns of the time series %s" % (norm_df.columns,))
    cwords = norm_df.word.values
    print("Number of words we are processing %s" % (len(cwords),))

    # np.ceil returns a float; the chunk size is a count, so make it an int.
    chunksz = int(np.ceil(len(cwords) / float(n_jobs)))

    if cp_pval:
        results = parallelize_func(cwords[:], get_pval_word_chunk,
                                   chunksz=chunksz, n_jobs=n_jobs,
                                   df=norm_df, threshold=threshold)
        cps, pvals = zip(*results)
        # R returns 1 for a very high stat significance. So we invert it as
        # for us low pvalues mean more significance.
        pvals = [(1.0 - pval) for pval in pvals]
        actual_cps = [get_actual_cp(norm_df, cp) for cp in cps]
        records = list(zip(cwords, actual_cps, pvals))
        header = ['word', 'cp', 'pval']
        sort_column = 'pval'
    else:
        cps = parallelize_func(cwords[:], get_cp_word_chunk,
                               chunksz=chunksz, n_jobs=n_jobs, df=norm_df)
        actual_cps = [get_actual_cp(norm_df, cp) for cp in cps]
        records = list(zip(cwords, actual_cps))
        header = ['word', 'cp']
        sort_column = 'cp'

    pvalue_df = pd.DataFrame.from_records(records, columns=header)
    # DataFrame.sort(columns=...) was removed from pandas; sort_values is
    # the supported equivalent (ascending by default, as before).
    sdf = pvalue_df.sort_values(by=[sort_column])
    sdf.to_csv(pval_file, encoding='utf-8', index=False)