def edit(args):
    '''
    Placeholder for editing an existing sample entry.

    No modification is implemented yet: the configuration and the sample
    table are validated, a warning is printed, and the table is handed to
    ``save`` exactly as it was loaded.
    '''
    cfg = config.validate(args)
    sample_table = validate(cfg)

    # TODO: make changes based on args
    print('WARNING: not implemented')

    save(cfg, sample_table)
def remove(args):
    '''
    Drop the sample named ``args.name`` from the sample table and save it.

    Raises RuntimeError when no row of the table carries that name.
    '''
    cfg = config.validate(args)
    sample_table = validate(cfg)

    # guard clause: there is nothing to remove when the name is unknown
    if args.name not in sample_table.name.values:
        raise RuntimeError('sample {} not found in sample info '
                           'file'.format(args.name))

    keep_mask = sample_table.name != args.name
    save(cfg, sample_table[keep_mask])
def add(args):
    '''
    Register a new sample in the sample table.

    Two checks are performed, each printing an ERROR line on failure:
    ``args.name`` must not already be present in the table, and every path
    in ``args.fastq_files`` must exist. Only when both checks pass is the
    md5 digest of each fastq file computed (read in CHUNK_SIZE pieces).

    A row holding the name, group, comma-joined fastq paths, description,
    timestamp, two empty fields, comma-joined digests and a JSON-encoded
    log entry is appended to the table. The table is saved only when no
    check failed; otherwise RuntimeError is raised and nothing is saved.
    '''
    cfg = config.validate(args)
    sample_table = validate(cfg)

    failed = False

    # the sample name must be new
    if args.name in sample_table.name.values:
        print('ERROR: sample {} already exists'.format(args.name))
        failed = True

    # every fastq file must be present on disk
    missing_paths = [path for path in args.fastq_files
                     if not os.path.exists(path)]
    for path in missing_paths:
        print('ERROR: fastq file {} does not exist'.format(path))
        failed = True

    fastq_str = ','.join(args.fastq_files)

    # digest the fastq files, but only when the checks above succeeded
    digests = []
    if not failed:
        for path in args.fastq_files:
            print('computing md5sum for file {}'.format(path))
            hasher = hashlib.md5()
            with open(path, 'rb') as handle:
                # iter() with a sentinel stops at the first empty read (EOF)
                for block in iter(lambda: handle.read(CHUNK_SIZE), b''):
                    hasher.update(block)
            digests.append(hasher.hexdigest())
    md5sums_str = ','.join(digests)

    # record when the sample was added, plus a first log entry
    added = pd.Timestamp.now()
    log_message = (f'sample added {args.name}, '
                   f'{args.group}, {fastq_str}, {md5sums_str}, '
                   f'{args.description}')
    sample_log = [{'ts': added.strftime('%c'), 'msg': log_message}]

    new_row = [
        args.name,
        args.group,
        fastq_str,
        args.description,
        added,
        None,
        None,
        md5sums_str,
        json.dumps(sample_log),
    ]
    sample_table.loc[len(sample_table)] = new_row

    if failed:
        raise RuntimeError('at least one failure occured while '
                           'adding the sample')

    save(cfg, sample_table)
def extract(args):
    '''
    After running htseq-count on a set of specified RNA-seq samples, combine
    all of their summary count files into one master dataframe of gene rows
    and experiment sample columns.

    The working directory is changed to ``<cfg.project_dir>/workspace`` and
    each sample's counts are read from ``<name>/<name>.summary.dat`` (tab
    separated, two columns: locus tag and count). The merged table is
    written as a TSV to ``args.output``.

    args attributes read here: ``samples`` (list of names or None for all),
    ``values`` (``"TPM"`` triggers TPM conversion) and ``output``.

    Raises:
        KeyError: a name in ``args.samples`` is not in the sample table.
        ValueError: no sample is selected, or a summary file / the merged
            frame does not have the same number of rows as the first
            sample's summary file.
    '''
    # validate config and retrieve specified samples (use all by default)
    cfg = config.validate(args)
    samples = sample.validate(cfg)
    os.chdir(os.path.join(cfg.project_dir, "workspace"))

    # If sample names were provided from the command line, check that each
    # one exists in the config before filtering down to them
    if args.samples is not None:
        known_names = set(samples['name'])
        for smp in args.samples:
            if smp not in known_names:
                raise KeyError(
                    "Invalid sample name",
                    f"The sample {smp} is not contained in the list of valid samples."
                )
        samples = samples.loc[samples['name'].isin(args.samples)]

    # master_df collects one count column per sample; rows remembers the
    # row count of the first summary file so that changes in the upstream
    # counting software are caught
    master_df = None
    rows = 0

    for smp in samples['name'].values:
        summary_file = os.path.join(smp, smp + '.summary.dat')
        sample_df = pd.read_csv(summary_file, sep='\t',
                                names=['locus_tag', smp])

        # the first sample seeds the master frame
        if master_df is None:
            master_df = sample_df
            rows = master_df.shape[0]
            continue

        # a different number of rows means something probably changed in
        # the upstream counting software
        if rows != sample_df.shape[0]:
            raise ValueError(
                "Data frame size mismatch",
                f"The number of rows in the master data frame is {rows} "
                f"and the number of rows in the data frame for smp {smp} "
                f"contains {sample_df.shape[0]}"
            )

        # merge in the new experiment, joining on the gene names
        master_df = master_df.merge(sample_df, how='outer', on='locus_tag')

        # an outer join grows the frame when the locus tags differ, so
        # check the row count again after merging
        if rows != master_df.shape[0]:
            raise ValueError(
                "Data frame size mismatch",
                f"The number of rows in the old master data frame is {rows} "
                f"and the number of rows in the merged data frame including "
                f"smp {smp} contains {master_df.shape[0]}"
            )

    # without any sample there is no frame to annotate or save
    if master_df is None:
        raise ValueError("No samples were selected, nothing to extract")

    # drop locus rows like "__no_feature", "__ambiguous",
    # "__too_low_quality" from df
    is_real_locus = [not tag.startswith("__")
                     for tag in master_df['locus_tag']]
    master_df = master_df.loc[is_real_locus]

    # add gene info to master df; the return value is not used
    # NOTE(review): presumably this annotates master_df in place - confirm
    add_gene_info_to_master(master_df, cfg.reference_gb_path)

    # convert to tpm if requested (the second returned value is not needed)
    if args.values == "TPM":
        master_df, _ = calculate_tpm(master_df, samples['name'].values)

    # save final merged df as a tsv
    print(f'saving output to {args.output}')
    master_df.to_csv(args.output, sep='\t', header=True, index=False)
def run(args):
    '''
    Build the per-sample pipeline commands and either run them or write
    them to a shell script.

    The working directory is changed to ``<cfg.project_dir>/workspace``.
    If the ``.fai`` index next to the reference fasta cannot be opened
    because it does not exist, it is created with ``samtools faidx``. For
    every selected sample five commands are built (bwa, view, sort, index,
    htseq) and grouped by step.

    With ``args.save_as_scripts`` set, nothing is executed: the commands
    are written step by step to ``barrelseq.sh`` in the working directory.
    Otherwise each step is run for all samples before the next step starts,
    using a process pool when more than one process is requested, and the
    collected output of each step is printed. Intermediates are removed
    afterwards unless ``args.save_intermediate_files`` is set.

    args attributes read here: ``samples`` (list of names or None for all),
    ``save_as_scripts``, ``processes`` (None means 1) and
    ``save_intermediate_files``.

    Raises:
        KeyError: a name in ``args.samples`` is not in the sample table.
    '''
    cfg = config.validate(args)
    samples = sample.validate(cfg)
    os.chdir(os.path.join(cfg.project_dir, "workspace"))

    # validate the requested names first, then filter once
    if args.samples is not None:
        known_names = set(samples['name'])
        for smp in args.samples:
            if smp not in known_names:
                raise KeyError(
                    "Invalid sample name",
                    f"The sample {smp} is not contained in the list of valid samples."
                )
        samples = samples.loc[samples['name'].isin(args.samples)]

    # create the fasta index only when it is missing; other open() errors
    # (e.g. permissions) still propagate
    faidx = cfg.reference_fasta_path + ".fai"
    try:
        with open(faidx, 'r'):
            pass
    except FileNotFoundError:
        faidx_cmd = "{} faidx {}".format(cfg.samtools_path,
                                         cfg.reference_fasta_path)
        run_cmd(faidx_cmd)

    # one command column per pipeline step; bwa needs the whole row, the
    # other builders only take the sample name
    samples['bwa_cmd'] = samples.apply(
        lambda row: make_bwa_cmd(row, cfg), axis=1)
    samples['view_cmd'] = samples['name'].map(
        lambda name: make_view_cmd(name, cfg))
    samples['sort_cmd'] = samples['name'].map(
        lambda name: make_sort_cmd(name, cfg))
    samples['index_cmd'] = samples['name'].map(
        lambda name: make_index_cmd(name, cfg))
    samples['htseq_cmd'] = samples['name'].map(
        lambda name: make_htseq_cmd(name, cfg))

    # cmd_list[i] holds the i-th step's command for every sample
    step_columns = ['bwa_cmd', 'view_cmd', 'sort_cmd', 'index_cmd',
                    'htseq_cmd']
    cmd_list = [samples[column].tolist() for column in step_columns]

    # script mode: don't run anything, put it all in a bash file
    if args.save_as_scripts:
        with open("barrelseq.sh", 'w') as script:
            script.write("#!/bin/bash\n\n")
            for step in cmd_list:
                for specific_command in step:
                    script.write(specific_command)
                    script.write("\n")
                script.write("\n")
        return

    # work on a local value so the caller's args object is left untouched
    requested = 1 if args.processes is None else args.processes

    for step in cmd_list:
        # never start more workers than there are commands in the step
        n_procs = min(requested, len(step))
        if n_procs > 1:
            with mp.Pool(processes=n_procs) as pool:
                output = "\n".join(pool.map(run_cmd, step))
        else:
            output = "".join(
                run_cmd(specific_command) for specific_command in step)
        print(output)

    # intermediates only exist once the commands have actually been run
    if not args.save_intermediate_files:
        for name in samples['name']:
            remove_intermediates(name)
def view(args):
    '''
    Print the row(s) of the sample table whose name equals ``args.name``.
    '''
    cfg = config.validate(args)
    sample_table = validate(cfg)

    name_matches = sample_table['name'] == args.name
    print(sample_table.loc[name_matches])