Example #1
0
def edit(args):
    """Placeholder for editing an existing sample entry.

    Validates the configuration, loads the sample table, prints a warning
    that editing is not implemented, and writes the table back unchanged.
    """
    cfg = config.validate(args)
    sample_table = validate(cfg)
    # TODO: apply the modifications requested through args
    print('WARNING: not implemented')
    save(cfg, sample_table)
Example #2
0
def remove(args):
    """Drop the sample named ``args.name`` from the sample table and save it.

    Raises:
        RuntimeError: if no row of the sample table carries that name.
    """
    cfg = config.validate(args)
    sample_table = validate(cfg)

    # Guard clause: refuse to continue when the sample is unknown.
    if args.name not in sample_table.name.values:
        raise RuntimeError('sample {} not found in sample info '
                           'file'.format(args.name))

    keep_mask = sample_table.name != args.name
    save(cfg, sample_table[keep_mask])
Example #3
0
def add(args):
    """Append a new sample row to the sample table and save it.

    Every detected problem (duplicate sample name, missing fastq file) is
    printed as an ``ERROR:`` line before the function fails, so the user sees
    all of them at once. When the checks pass, an md5 digest is computed for
    each fastq file and a row is appended holding, in order: name, group,
    comma-joined fastq paths, description, the current timestamp, two ``None``
    placeholders, the comma-joined md5 digests, and a JSON-encoded log entry.

    Raises:
        RuntimeError: if the sample name already exists or any fastq file
            is missing.
    """
    cfg = config.validate(args)
    sample_table = validate(cfg)

    # Collect all validation failures before bailing out.
    failed = False
    if args.name in sample_table.name.values:
        print('ERROR: sample {} already exists'.format(args.name))
        failed = True
    for path in args.fastq_files:
        if not os.path.exists(path):
            print('ERROR: fastq file {} does not exist'.format(path))
            failed = True
    fastq_str = ','.join(args.fastq_files)
    if failed:
        raise RuntimeError('at least one failure occured while '
                           'adding the sample')

    # Hash each fastq file in CHUNK_SIZE pieces so large files are never
    # read into memory in one go.
    digests = []
    for path in args.fastq_files:
        print('computing md5sum for file {}'.format(path))
        hasher = hashlib.md5()
        with open(path, 'rb') as handle:
            # iter() with a sentinel stops once read() returns empty bytes
            for block in iter(lambda: handle.read(CHUNK_SIZE), b''):
                hasher.update(block)
        digests.append(hasher.hexdigest())
    md5sums_str = ','.join(digests)

    # Timestamp of the addition, stored both in the row and in the log entry.
    added = pd.Timestamp.now()
    timestamp_text = added.strftime('%c')
    log_message = (
        f'sample added {args.name}, '
        f'{args.group}, {fastq_str}, {md5sums_str}, {args.description}')
    sample_log = [{'ts': timestamp_text, 'msg': log_message}]

    new_row = [
        args.name, args.group, fastq_str, args.description, added, None,
        None, md5sums_str,
        json.dumps(sample_log)
    ]
    sample_table.loc[len(sample_table)] = new_row
    save(cfg, sample_table)
Example #4
0
def extract(args):
    '''
    After running htseq-count on a set of specified RNA-seq samples,
    combine all of their summary count files into one master dataframe
    of gene rows and experiment sample columns.

    The working directory is changed to ``<project_dir>/workspace`` and each
    sample's counts are read from ``<sample>/<sample>.summary.dat`` relative
    to it. The merged table is written as a TSV to ``args.output``.

    Raises:
        KeyError: if a name in ``args.samples`` is not in the sample table.
        ValueError: if no samples are selected, or if the per-sample summary
            files do not all yield the same number of rows (before or after
            merging on ``locus_tag``).
    '''

    # validate config and retrieve specified samples (use all by default)
    cfg = config.validate(args)
    samples = sample.validate(cfg)

    os.chdir(os.path.join(cfg.project_dir, "workspace"))

    # If sample names were provided from the command line, check that each
    # one exists in the sample table, then restrict to those samples
    if args.samples is not None:
        valid_names = set(samples['name'])
        for smp in args.samples:
            if smp not in valid_names:
                raise KeyError(
                    "Invalid sample name",
                    f"The sample {smp} is not contained in the list of valid samples."
                )
        samples = samples.loc[samples['name'].isin(args.samples)]

    # master df collects one column per experiment sample
    master_df = None
    # row count of the first summary file; every later file must match it,
    # which catches changes in the upstream counting software
    rows = 0

    for smp in samples['name'].values:
        summary_file = os.path.join(smp, smp + '.summary.dat')
        sample_df = pd.read_csv(summary_file,
                                sep='\t',
                                names=['locus_tag', smp])

        # the first sample initializes the master df
        if master_df is None:
            master_df = sample_df
            rows = master_df.shape[0]
            continue

        # check if this experiment has a different number of rows in the summary
        # (if so, something probably changed in upstream counting software)
        if rows != sample_df.shape[0]:
            raise ValueError(
                "Data frame size mismatch",
                f"The number of rows in the master data frame is {rows} and the number of rows in the data frame for smp {smp} contains {sample_df.shape[0]}"
            )

        # merge in the new experiment, joining on the gene names
        master_df = master_df.merge(sample_df, how='outer', on='locus_tag')

        # an outer join adds rows when the locus tags differ, so check again.
        # (Raise an exception instance: raising a tuple is itself a TypeError.)
        if rows != master_df.shape[0]:
            raise ValueError(
                "Data frame size mismatch",
                f"The number of rows in the old master data frame is {rows} and the number of rows in the merged data frame including smp {smp} contains {master_df.shape[0]}"
            )

    # nothing was loaded: fail with a clear message instead of crashing on None
    if master_df is None:
        raise ValueError(
            "No samples to extract",
            "No summary count files were loaded because the sample list is empty."
        )

    # drop locus rows like "__no_feature", "__ambiguous", "__too_low_quality" from df
    is_real_locus = [
        not tag.startswith("__") for tag in master_df['locus_tag']
    ]
    # explicit copy so the helper below works on an independent frame
    # (presumably it adds columns in place; its return value is not used)
    master_df = master_df.loc[is_real_locus].copy()

    # add gene info to master df
    add_gene_info_to_master(master_df, cfg.reference_gb_path)

    # convert to tpm if requested
    if args.values == "TPM":
        master_df, _tpm_cols = calculate_tpm(master_df, samples['name'].values)

    # save final merged df as a tsv
    print(f'saving output to {args.output}')
    master_df.to_csv(args.output, sep='\t', header=True, index=False)

    return
Example #5
0
def run(args):
    """Build and execute (or export) the per-sample processing commands.

    For every selected sample, five command strings are generated (bwa, view,
    sort, index, htseq). With ``args.save_as_scripts`` they are written, step
    by step, to ``barrelseq.sh`` in the workspace directory; otherwise each
    step is executed for all samples before the next step starts, using up to
    ``args.processes`` worker processes.

    Raises:
        KeyError: if a name in ``args.samples`` is not in the sample table.
    """
    cfg = config.validate(args)
    samples = sample.validate(cfg)

    os.chdir(os.path.join(cfg.project_dir, "workspace"))

    # Validate the requested names against the full table first, then filter.
    # With no names given (args.samples is None) every sample is processed.
    if args.samples is not None:
        valid_names = set(samples['name'])
        for smp in args.samples:
            if smp not in valid_names:
                raise KeyError(
                    "Invalid sample name",
                    f"The sample {smp} is not contained in the list of valid samples."
                )
        samples = samples.loc[samples['name'].isin(args.samples)]

    # Command columns are added below; work on an explicit copy so they are
    # never assigned onto a slice of the validated sample table.
    samples = samples.copy()

    # Create the fasta index if it does not exist yet.
    faidx = cfg.reference_fasta_path + ".fai"
    if not os.path.exists(faidx):
        faidx_cmd = "{} faidx {}".format(cfg.samtools_path,
                                         cfg.reference_fasta_path)
        run_cmd(faidx_cmd)

    samples['bwa_cmd'] = samples.apply(lambda x: make_bwa_cmd(x, cfg), axis=1)
    samples['view_cmd'] = samples['name'].map(lambda x: make_view_cmd(x, cfg))
    samples['sort_cmd'] = samples['name'].map(lambda x: make_sort_cmd(x, cfg))
    samples['index_cmd'] = samples['name'].map(
        lambda x: make_index_cmd(x, cfg))
    samples['htseq_cmd'] = samples['name'].map(
        lambda x: make_htseq_cmd(x, cfg))

    # One list of commands per pipeline step, in execution order.
    step_names = ('bwa_cmd', 'view_cmd', 'sort_cmd', 'index_cmd', 'htseq_cmd')
    cmd_list = [samples[step].tolist() for step in step_names]

    if args.save_as_scripts:
        # Don't run anything; put it all in a bash file, steps separated by
        # a blank line.
        with open("barrelseq.sh", 'w') as b:
            b.write("#!/bin/bash\n\n")
            for step in cmd_list:
                for specific_command in step:
                    b.write(specific_command)
                    b.write("\n")
                b.write("\n")
    else:
        requested = 1 if args.processes is None else args.processes
        for step in cmd_list:
            # never start more workers than there are commands in the step
            n_procs = min(requested, len(step))
            if n_procs > 1:
                with mp.Pool(processes=n_procs) as pool:
                    output = "\n".join(pool.map(run_cmd, step))
            else:
                output = "".join(
                    run_cmd(specific_command) for specific_command in step)
            print(output)

    # NOTE(review): this cleanup also runs when only the script was written;
    # confirm that is intended.
    if not args.save_intermediate_files:
        for name in samples['name']:
            remove_intermediates(name)

    return
Example #6
0
def view(args):
    """Print the sample-table rows whose ``name`` equals ``args.name``."""
    cfg = config.validate(args)
    sample_table = validate(cfg)
    name_matches = sample_table['name'] == args.name
    print(sample_table.loc[name_matches])