Example #1
import gzip

import pandas as pd
import commanderline.commander_line as cl

def custom_db(db_gz, gt_23andMe_gz, n_snps=100000, out_file_gz='out_custom_db.txt.gz'):
    '''
+-----------+
| custom_db |
+-----------+

Script used to pull out SNPs present in both the database and the 23andMe file, sort them by F_ST, and write the top n_snps to a gzipped out_file

Package dependencies:
pandas
commanderline
'''
    print('Reading database...')
    db = pd.read_csv(db_gz, compression='gzip', sep=' ')
    print('Reading 23andMe file...')
    # Columns 1 and 2 of a raw 23andMe file are chromosome and position
    gt = pd.read_csv(gt_23andMe_gz, compression='gzip', usecols=[1, 2],
                     sep='\t', comment='#', header=None)
    gt.columns = ['chr', 'pos']
    print('Merging on chr&pos...')
    m = pd.merge(db, gt, on=['chr', 'pos'])
    print('Sorting by F_ST...')
    m = m.sort_values(by='fst', ascending=False)
    print('Filtering...')
    # Keep the top n_snps rows and drop the trailing 'fst' column
    m_out = m.iloc[:n_snps, :(len(m.columns) - 1)]
    print('Sorting by chr & pos, and writing to file...')
    with gzip.open(out_file_gz, 'wt') as f_out:
        m_out.sort_values(by=['chr', 'pos']).to_csv(f_out, sep=' ', index=False)

cl.commander_line(custom_db) if __name__ == '__main__' else None
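
For context, a raw 23andMe export is a tab-separated file with '#' comment lines and four columns (rsid, chromosome, position, genotype), which is why the reader above keeps only columns 1 and 2. A minimal sketch of a direct call, with placeholder file names (the database could plausibly be the out_db_fst.txt.gz written by the calc_fst helper shown later, since custom_db sorts on its 'fst' column):

# Both file names are illustrative placeholders
custom_db('out_db_fst.txt.gz', 'genome_sample.txt.gz', n_snps=50000)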

Example #2

    if red == 'avg':
        avg = True
        raise Exception(
            'Average used, but its output is not always consistent - especially for diag!'
        )
    elif red == 'sum':
        avg = False
    else:
        raise Exception('Error: Unknown argument: ' + red)

    # Output is done step-by-step
    # analysis.write_output_to_file(out_fname+'.txt.gz', cnst.const['out_delim'], avg)

    if generate_img:
        analysis.generate_img(out_fname + cnst.const['img_out_ext'])

    flat.print_log_msg('Done')


def print_opt_arg_error():
    print('For help use --help')
    return 2


if __name__ == '__main__':
    cl.commander_line((pipeline_lean, pipeline, multiple_pipelines))

Example #3

def validate_bootstrap(file_list, ancestry_translation,
                       negligible_threshold, bootstrap_confidence):
    '''
Package dependencies:
pandas
commanderline
'''

    results = pd.DataFrame()

    with open(file_list, 'rt') as f_in:
        for i, f in enumerate(f_in):
            f = f.strip()
            d1 = pd.read_csv(f, header=None, sep=' ')
            # Map each population label (column 0) to its superpopulation
            d1['super'] = d1[0].apply(lambda el: ancestry_translation[el])
            d2 = d1.groupby('super').sum()
            d2.columns = [i]

            results = pd.concat([results, d2], axis=1)

    # Keep only superpopulations present in the point estimate (column 0)
    results = results.dropna(subset=[0])

    # For each superpopulation, the fraction of bootstrap replicates
    # (columns 1 onward) in which it reaches the negligible threshold
    bootstrap_replicate_count = results.iloc[:, 1:].apply(
        lambda el: el >= negligible_threshold).T.sum() / (len(results.columns) - 1)

    filt = bootstrap_replicate_count.apply(lambda el: el >= bootstrap_confidence)

    interm_results = results[filt][0]

    # Probability mass not assigned to a confidently supported superpopulation
    ambig = 1 - interm_results.sum()

    final_results = pd.concat([interm_results, pd.Series({'AMBIGUOUS': ambig})])

    print(final_results)

cl.commander_line(validate_bootstrap) if __name__ == '__main__' else None
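
The ancestry_translation argument maps each fine-grained population label (column 0 of every results file) to a superpopulation, and file_list names a text file with one results file per line; the code treats the first listed file as the point estimate and the rest as bootstrap replicates. A minimal sketch of a direct call, with purely illustrative names and thresholds:

# Hypothetical superpopulation mapping and thresholds, for illustration only
translation = {'CEU': 'EUR', 'TSI': 'EUR', 'YRI': 'AFR', 'CHB': 'EAS'}
validate_bootstrap('bootstrap_files.txt', translation,
                   negligible_threshold=0.01, bootstrap_confidence=0.95)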
Example #4
import gzip

import pandas as pd
import commanderline.commander_line as cl


def calc_fst(ref_gz, out_gz='out_db_fst.txt.gz'):
    '''
+----------+
| calc_fst |
+----------+

Helper script to calculate F_ST for Ancestry-format reference files. Output is the input file with an additional column named 'fst' (SNPs with NaN for F_ST are removed)

Details about Ancestry:
https://bitbucket.org/joepickrell/ancestry

Package dependencies:
pandas
commanderline
'''
    print('Reading Ancestry-format reference file...')
    d = pd.read_csv(ref_gz, sep=' ')
    # Drop all-null columns, e.g., a trailing delimiter after the last column produces a null column
    d = d.dropna(axis=1, how='all')
    print('Calculating F_ST...')
    # 'fst' (defined elsewhere in this module) computes per-SNP F_ST from
    # the per-population columns (index 6 onward)
    d['fst'] = d.iloc[:, 6:].apply(fst, axis=1)
    # Drop SNPs without F_ST
    d = d.dropna(subset=['fst'])
    print('Writing output file...')
    with gzip.open(out_gz, 'wt') as f_out:
        d.to_csv(f_out, sep=' ', index=False)


cl.commander_line(calc_fst) if __name__ == '__main__' else None
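
The fst helper applied above is not shown in this snippet. As an illustration only, a minimal sketch under the assumption that each row holds per-population alternate-allele frequencies, using the basic formulation F_ST = (H_T - H_S) / H_T, where H_T is the expected heterozygosity at the pooled allele frequency and H_S the mean within-population heterozygosity:

import numpy as np

def fst(row):
    # Hypothetical per-SNP F_ST; assumes 'row' holds per-population
    # alternate-allele frequencies for one SNP
    p = np.asarray(row, dtype=float)
    p = p[~np.isnan(p)]
    if len(p) < 2:
        return np.nan
    p_bar = p.mean()                # pooled allele frequency
    h_t = 2 * p_bar * (1 - p_bar)   # total expected heterozygosity
    h_s = (2 * p * (1 - p)).mean()  # mean within-population heterozygosity
    if h_t == 0:
        return np.nan               # monomorphic SNP: F_ST undefined
    return (h_t - h_s) / h_t

Returning NaN for monomorphic SNPs is consistent with the dropna(subset=['fst']) step above, which removes SNPs without a defined F_ST.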
Example #5
    results = pd.DataFrame()

    with open(file_list, 'rt') as f_in:
        for i, f in enumerate(f_in):
            f = f.strip()
            d1 = pd.read_csv(f, header=None, sep=' ')
            d1['super'] = d1[0].apply(lambda el: ancestry_translation[el])
            d2 = d1.groupby('super').sum()
            d2.columns = [i]

            results = pd.concat([results, d2], axis=1)

    results = results.dropna(subset=[0])

    bootstrap_replicate_count = results.iloc[:, 1:].apply(
        lambda el: el >= negligible_threshold).T.sum() / (
            len(results.columns) - 1)

    filt = bootstrap_replicate_count.apply(
        lambda el: el >= bootstrap_confidence)

    interm_results = results[filt][0]

    ambig = 1 - interm_results.sum()

    final_results = pd.concat(
        [interm_results, pd.Series({'AMBIGUOUS': ambig})])

    print(final_results)


cl.commander_line(validate_bootstrap) if __name__ == '__main__' else None
Example #6
def calc_fst(ref_gz, out_gz='out_db_fst.txt.gz'):
    '''
+----------+
| calc_fst |
+----------+

Helper script to calculate F_ST for Ancestry-format reference files. Output is the input file with an additional column named 'fst' (SNPs with NaN for F_ST are removed)

Details about Ancestry:
https://bitbucket.org/joepickrell/ancestry

Package dependencies:
pandas
commanderline
'''
    print('Reading Ancestry-format reference file...')
    d = pd.read_csv(ref_gz, sep=' ')
    # Drop all-null columns, e.g., a trailing delimiter after the last column produces a null column
    d = d.dropna(axis=1, how='all')
    print('Calculating F_ST...')
    d['fst'] = d.iloc[:, 6:].apply(fst, axis=1)
    # Drop SNPs without F_ST
    d = d.dropna(subset=['fst'])
    print('Writing output file...')
    with gzip.open(out_gz, 'wt') as f_out:
        d.to_csv(f_out, sep=' ', index=False)

cl.commander_line(calc_fst) if __name__ == '__main__' else None

Example #7
def custom_db(db_gz, gt_23andMe_gz, n_snps=100000,
              out_file_gz='out_custom_db.txt.gz'):
    '''
+-----------+
| custom_db |
+-----------+

Script used to pull out SNPs present in both the database and the 23andMe file, sort them by F_ST, and write the top n_snps to a gzipped out_file

Package dependencies:
pandas
commanderline
'''
    print('Reading database...')
    db = pd.read_csv(db_gz, compression='gzip', sep=' ')
    print('Reading 23andMe file...')
    gt = pd.read_csv(gt_23andMe_gz,
                     compression='gzip',
                     usecols=[1, 2],
                     sep='\t',
                     comment='#',
                     header=None)
    gt.columns = ['chr', 'pos']
    print('Merging on chr&pos...')
    m = pd.merge(db, gt, on=['chr', 'pos'])
    print('Sorting by F_ST...')
    m = m.sort_values(by='fst', ascending=False)
    print('Filtering...')
    m_out = m.iloc[:n_snps, :(len(m.columns) - 1)]
    print('Sorting by chr & pos, and writing to file...')
    with gzip.open(out_file_gz, 'wt') as f_out:
        m_out.sort_values(by=['chr', 'pos']).to_csv(f_out,
                                                    sep=' ',
                                                    index=False)


cl.commander_line(custom_db) if __name__ == '__main__' else None

Example #8

    print_metric(metric1)
    flat.print_log_msg('Breakpoint 2: ' + repr(breakpoint2))
    flat.print_log_msg('Metric 2:')
    print_metric(metric2)


def apply_metric(chr_name, begin, end, input_config, loci):
    metric_out = metric.Metric(chr_name, input_config, loci, begin, end)
    out = metric_out.calc_metric()

    return out


def print_metric(metric_out):
    flat.print_log_msg('Sum: ' + repr(metric_out['sum']))
    flat.print_log_msg('N (w/ zero\'s): ' + repr(metric_out['N_zero']))
    flat.print_log_msg('Metric: ' +
                       repr(metric_out['sum'] / metric_out['N_zero']))


def print_opt_arg_error():
    print('For help use --help')
    return 2


if __name__ == '__main__':
    cl.commander_line((pipeline))

Example #9

            dist = -0.1

            for key in loaded:
                # print()
                # print(key, loaded[key])
                try:
                    if 'loci' in loaded[key]:
                        dist -= 0.05
                        breakpoint_list_y = [dist for a in loaded[key]['loci']]
                        pt.scatter(np.array(loaded[key]['loci']),
                                   np.array(breakpoint_list_y),
                                   c=next(colors),
                                   label=key,
                                   marker=next(markers))
                except TypeError as e:
                    print(e)

        pt.legend()
        pt.show()


def plot_gwas_w_default_paths(chr_name, input_pickle_fname):
    plot_gwas(
        cnst.const['gwas_plots']['height']['root'] +
        cnst.const['gwas_plots']['height']['file_prefix'] + chr_name +
        cnst.const['gwas_plots']['height']['file_suffix'], input_pickle_fname)


if __name__ == '__main__':
    cl.commander_line((plot_gwas_w_default_paths, plot_gwas))
    # input_config = cnst.const['orig_data_'+dataset]
    input_config = cnst.return_conf(dataset_path)

    partitions = flat.read_partitions(name, input_config)

    with open(input_pickle_fname, 'rb') as f_in:
        loaded = pickle.load(f_in)

        # print(loaded)

        loci = loaded[subset]['loci']

        first = partitions[0][0]
        last = partitions[-1][1]

        # print(loci)

        print('chr', '\t', 'start', '\t', 'stop')

        print(name, '\t', first, '\t', loci[0])

        for i in range(len(loci) - 1):
            print(name, '\t', loci[i], '\t', loci[i + 1])

        print(name, '\t', loci[-1], '\t', last + 1)


if __name__ == '__main__':
    cl.commander_line((chr_bpoints_to_bed, ),
                      print_argv_to_output=False,
                      print_done=False)
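
With hypothetical values (name='chr21', partitions spanning positions 1 to 20000, and breakpoint loci [5000, 12000]), the printed BED-like table would be:

chr     start   stop
chr21   1       5000
chr21   5000    12000
chr21   12000   20001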

Example #10

    if red == 'avg':
        avg = True
        raise Exception(
            'Average used, but its output is not always consistent - especially for diag!'
        )
    elif red == 'sum':
        avg = False
    else:
        raise Exception('Error: Unknown argument: ' + red)

    # Output is done step-by-step
    # analysis.write_output_to_file(out_fname+'.txt.gz', cnst.const['out_delim'], avg)

    if generate_img:
        analysis.generate_img(out_fname + cnst.const['img_out_ext'])

    flat.print_log_msg('Done')


def print_opt_arg_error():
    print('For help use --help')
    return 2


if __name__ == '__main__':
    cl.commander_line((pipeline_lean, ))