Example #1
import numpy as np
import pandas as pd

# Assumes module-level helpers from the surrounding project:
# parallelize_func, get_pval_word_chunk, get_minpval_cp, get_cp_pval,
# and the TS_OFFSET constant (offset of the first time-point column).
def main(args):
    # Read the arguments
    df_f = args.filename
    pval_file = args.pval_file
    sample_file = args.sample_file
    col_to_drop = args.col
    threshold = float(args.threshold)
    workers = args.workers
    print "Config:"
    print "Input data frame file name:", df_f
    print "Output pvalue file", pval_file
    print "Output sample file", sample_file
    print "Columns to drop", col_to_drop
    print "Threshold", threshold

    # Read the time series data
    norm_df = pd.read_csv(df_f)

    # Drop the column if requested. We typically drop the first column, as it
    # is always 0 by default.
    if col_to_drop in norm_df.columns:
        cols = norm_df.columns.tolist()
        if col_to_drop == norm_df.columns[-1]:
            time_points = cols[2:]
            new_cols = cols[0:2] + time_points[::-1]
            norm_df = norm_df[new_cols]
        norm_df.drop(col_to_drop, axis=1, inplace=True)
        print "Dropped column", col_to_drop

    print "Columns of the data frame are", norm_df.columns
    cwords = norm_df.word.values
    print "Number of words we are analyzing:", len(cwords)

    chunksz = np.ceil(len(cwords) / float(workers))
    results = parallelize_func(cwords[:], get_pval_word_chunk,
                               chunksz=chunksz, n_jobs=workers,
                               df=norm_df, B=args.B)

    pvals, num_samples = zip(*results)

    header = ['word'] + list(norm_df.columns[TS_OFFSET:len(pvals[0]) + 1])
    pvalue_df = pd.DataFrame.from_records(list(pvals), columns=header)

    # Append additional columns to the final df
    pvalue_df_final = pvalue_df.copy(deep=True)

    pvalue_df_final['min_pval'], pvalue_df_final['cp'] = zip(*pvalue_df.apply(get_minpval_cp, axis=1))
    pvalue_df_final['tpval'], pvalue_df_final['tcp'] = zip(*pvalue_df.apply(get_cp_pval, axis=1, zscore_df=norm_df, threshold=threshold))

    pvalue_df_final.drop(norm_df.columns[TS_OFFSET:len(pvals[0]) + 1], axis=1, inplace=True)

    # Write the per-word sample counts.
    num_samples_df = pd.DataFrame.from_records(list(num_samples), columns=header)
    num_samples_df.to_csv(sample_file, encoding='utf-8')

    # Write the p-value output, sorted by p-value.
    sdf = pvalue_df_final.sort_values(by=['tpval'])
    sdf.to_csv(pval_file, encoding='utf-8')
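
Both examples assume a module-level parallelize_func helper whose implementation is not shown. The sketch below is a minimal, hypothetical version built on multiprocessing.Pool and functools.partial; only the call signature (iterable, func, chunksz, n_jobs, extra keyword arguments) is taken from the code above.

# Hypothetical sketch of the parallelize_func helper assumed by the examples;
# the original implementation is not shown. Splits `iterable` into chunks of
# `chunksz` items, runs `func(chunk, **kwargs)` in `n_jobs` worker processes,
# and concatenates the per-chunk result lists.
import itertools
from functools import partial
from multiprocessing import Pool

def parallelize_func(iterable, func, chunksz=1, n_jobs=16, **kwargs):
    chunksz = int(chunksz)
    chunks = [iterable[i:i + chunksz]
              for i in range(0, len(iterable), chunksz)]
    pool = Pool(processes=n_jobs)
    try:
        chunk_results = pool.map(partial(func, **kwargs), chunks)
    finally:
        pool.close()
        pool.join()
    # Flatten the per-chunk result lists into one flat list, so callers can
    # do zip(*results) over per-word tuples.
    return list(itertools.chain.from_iterable(chunk_results))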
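
Example #1 also applies two row-wise helpers, get_minpval_cp and get_cp_pval, that are not defined here. As a rough illustration of the expected shape, here is a minimal sketch of the first one, assuming each row holds a word followed by one p-value per time point:

# Hypothetical sketch of get_minpval_cp; the project's real helper is not
# shown. Given one row of the p-value frame (the word plus per-time-point
# p-values), return the smallest p-value and the time point at which it
# occurs, which the caller stores as the changepoint (cp).
def get_minpval_cp(row):
    pvals = row.drop('word').astype(float)
    return pvals.min(), pvals.idxmin()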
Example #2
import numpy as np
import pandas as pd

# Assumes module-level helpers from the surrounding project:
# get_filtered_df, normalize_timeseries, parallelize_func,
# get_pval_word_chunk, get_cp_word_chunk, and get_actual_cp.
def main(args):
    # Read the arguments
    df_f = args.filename
    common_vocab_file = args.vocab_file
    pval_file = args.pval_file
    col_to_drop = args.col
    should_normalize = not args.dont_normalize
    n_jobs = int(args.workers)
    cp_pval = args.dump_pval
    if args.threshold is not None:
        threshold = float(args.threshold)
    else:
        threshold = None

    print "CONFIG:"
    print "FILENAME:", df_f
    print "VOCAB FILE:", common_vocab_file
    print "PVAL_FILE:", pval_file
    print "COL TO DROP:", col_to_drop
    print "NORMALIZE:", should_normalize
    print "Threshold", threshold

    # Read the time series data
    df = pd.read_csv(df_f)
    # Restrict only to the common vocabulary.
    df = get_filtered_df(df, common_vocab_file)

    # Normalize the data frame
    if should_normalize:
        norm_df = normalize_timeseries(df)
    else:
        norm_df = df

    # Drop a column if needed.
    if col_to_drop in norm_df.columns:
        cols = norm_df.columns.tolist()
        if col_to_drop == norm_df.columns[-1]:
            time_points = cols[2:]
            new_cols = cols[0:2] + time_points[::-1]
            norm_df = norm_df[new_cols]
            print(norm_df.columns)
        norm_df.drop(col_to_drop, axis=1, inplace=True)

    print "Columns of the time series", norm_df.columns
    cwords = norm_df.word.values
    print "Number of words we are processing", len(cwords)

    chunksz = np.ceil(len(cwords) / float(n_jobs))
    if cp_pval:
        results = parallelize_func(cwords[:],
                                   get_pval_word_chunk,
                                   chunksz=chunksz,
                                   n_jobs=n_jobs,
                                   df=norm_df,
                                   threshold=threshold)
        cps, pvals = zip(*results)
        # The R routine reports values near 1 for high statistical
        # significance, so we invert them: in our convention, lower
        # p-values mean greater significance.
        pvals = [(1.0 - pval) for pval in pvals]
        actual_cps = [get_actual_cp(norm_df, cp) for cp in cps]
        results = zip(cwords, actual_cps, pvals)
        header = ['word', 'cp', 'pval']
        pvalue_df = pd.DataFrame().from_records(results, columns=header)
        sdf = pvalue_df.sort_values(by=['pval'])
        sdf.to_csv(pval_file, encoding='utf-8', index=False)
    else:
        results = parallelize_func(cwords[:],
                                   get_cp_word_chunk,
                                   chunksz=chunksz,
                                   n_jobs=n_jobs,
                                   df=norm_df)
        cps = results
        actual_cps = [get_actual_cp(norm_df, cp) for cp in cps]
        results = zip(cwords, actual_cps)
        header = ['word', 'cp']
        pvalue_df = pd.DataFrame().from_records(results, columns=header)
        sdf = pvalue_df.sort_values(by=['cp'])
        sdf.to_csv(pval_file, encoding='utf-8', index=False)
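
Each example reads its settings from an argparse-style args namespace. A plausible driver for Example #2, reconstructed from the attribute names read at the top of main (the flag spellings and defaults are assumptions):

# Hypothetical command-line driver for Example #2; only the attribute names
# come from the code above, the flag spellings and defaults are assumptions.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Changepoint detection on word time series.')
    parser.add_argument('-f', '--filename', required=True,
                        help='input time-series CSV')
    parser.add_argument('-v', '--vocab_file', required=True,
                        help='common vocabulary file')
    parser.add_argument('-p', '--pval_file', required=True,
                        help='output CSV of changepoints (and p-values)')
    parser.add_argument('-c', '--col', default=None,
                        help='column to drop, if any')
    parser.add_argument('--dont_normalize', action='store_true',
                        help='skip time-series normalization')
    parser.add_argument('--dump_pval', action='store_true',
                        help='also compute and dump p-values')
    parser.add_argument('-t', '--threshold', default=None,
                        help='p-value threshold')
    parser.add_argument('-w', '--workers', default=4,
                        help='number of worker processes')
    main(parser.parse_args())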