def test_cmd_all_ok_long_options(self, cmd_all_ok_long_options) -> None:
    """Check that `parse_arguments` correctly parses valid long options.

    `cmd_all_ok_long_options` is a fixture providing a full argv list;
    option values sit at the odd-ish fixed indices asserted below.
    """
    # Backup argv
    buff_argv: List[str] = sys.argv
    # Set test argv
    sys.argv = cmd_all_ok_long_options
    try:
        # Parse arguments
        params: args.HighlighterParams = args.parse_arguments()
    finally:
        # Restore argv. The original code assigned to a local `argv`
        # instead of `sys.argv`, leaking the test argv into later tests.
        # The `finally` also guarantees restoration if parsing raises.
        sys.argv = buff_argv
    # end try

    assert params.target_fasta_fpath == cmd_all_ok_long_options[2]
    assert params.bam_fpath == cmd_all_ok_long_options[4]
    assert params.outfpath == cmd_all_ok_long_options[6]
    # `is True` rather than `== True`: the flag must be a real bool
    assert params.suppress_zero_cov_output is True
    assert params.topology == 'circular'
    assert params.organism == cmd_all_ok_long_options[12]

    # Coverage thresholds round-trip: re-serialize the parsed thresholds
    # and compare against the raw command-line string.
    obtained_threshold_repr: str = ','.join(
        map(lambda covthr: str(covthr.get_coverage()),
            params.coverage_thresholds)
    )
    expected_threshold_repr: str = cmd_all_ok_long_options[8]
    assert obtained_threshold_repr == expected_threshold_repr
def test_cmd_bam_missing(self, cmd_bam_missing) -> None:
    """Check that `parse_arguments` exits when the BAM option is missing.

    A command line lacking the mandatory BAM argument must make the
    parser terminate the program (argparse exits via `SystemExit`).
    """
    # Backup argv
    buff_argv: List[str] = sys.argv
    # Set test argv
    sys.argv = cmd_bam_missing
    try:
        # Parse arguments: must bail out with SystemExit
        with pytest.raises(SystemExit):
            args.parse_arguments()
        # end with
    finally:
        # Restore argv. The original code assigned to a local `argv`
        # instead of `sys.argv`, leaking the test argv into later tests.
        sys.argv = buff_argv
    # end try
def main():
    """Score the training manifest and write it out sorted.

    Reads the TSV manifest named by the command-line arguments, applies
    the configured scoring function to it, and saves the resulting
    (sorted) manifest as `sorted_manifest.tsv` in the output directory.
    """
    # Get the arguments for the current execution.
    args = parse_arguments()
    print(args)

    # Read the training manifest tsv file.
    df = pd.read_csv(args.manifest, sep='\t')

    # Define the scoring function and apply it to the manifest.
    scoring_function = ScoringFunction().create(args)
    df = scoring_function(df)

    # Create the directory for output manifest storage.
    os.makedirs(args.out_dir, exist_ok=True)

    # Save the sorted manifest without the row index.
    # (`index=None` relied on undocumented falsy coercion;
    # `index=False` is the documented way to omit the index column.)
    df.to_csv(os.path.join(args.out_dir, 'sorted_manifest.tsv'),
              sep='\t', index=False)
def main():
    """Split the manifest into paced subsets and optionally train fairseq.

    Reads the (sorted) TSV manifest, applies the configured pacing
    function to obtain a list of progressively larger sub-manifests,
    writes each as `train_<i>.tsv` in the output directory, and — if
    requested via `--fairseq` — launches fairseq training on them.
    """
    # Get the arguments for the current execution.
    args = parse_arguments()
    print(args)

    # Read the training manifest tsv file.
    df = pd.read_csv(args.manifest, sep='\t')

    # Define the pacing function and apply it to the sorted manifest.
    pacing_function = PacingFunction().create(args)
    df_list = pacing_function(df)

    # Create the directory for output smaller datasets storage.
    # (makedirs with exist_ok=True avoids the isdir/mkdir race, creates
    # missing parents, and matches the scoring script's behaviour.)
    os.makedirs(args.out_dir, exist_ok=True)

    # Write each paced subset and collect its path.
    df_paths_list = []
    for index, sub_df in enumerate(df_list):
        path = os.path.join(args.out_dir, f'train_{index}.tsv')
        # `index=False` is the documented way to omit the index column.
        sub_df.to_csv(path, sep='\t', index=False)
        df_paths_list.append(path)

    if args.fairseq:
        train_fairseq(args, df_paths_list)
def main(version: str, last_update_date: str) -> None:
    """Run the consensus-highlighter pipeline end to end.

    Parses command-line arguments, reads the reference fasta, counts
    per-position coverage with `samtools depth`, annotates each sequence
    with coverage features for every configured threshold, and writes
    the annotated records to a GenBank output file.

    :param version: program version string, embedded in feature notes;
    :param last_update_date: program release date (currently unused here —
        presumably kept for interface symmetry with other entry points).
    """
    # Parse arguments
    params: HighlighterParams = parse_arguments()

    # This string will be used for annotation of result GenBank file
    base_feature_note: str = f'generated by consensus-highlighter v{version}'

    # String for storing info about warnings; becomes ' with warnings'
    # if any sequence is skipped, and is appended to the final message.
    with_warnings: str = ''

    # Read fasta records from input file
    print('Importing fasta from `{}`...'.format(params.target_fasta_fpath), end=' ')
    sys.stdout.flush()
    fasta_records: Sequence[SeqRecord] = pfr.parse_fasta_reference(
        params.target_fasta_fpath)
    print('done')

    # Create output directory and (re)initialize the output file
    _create_outdir_from_outfile(params.outfpath)
    out.create_or_emply_file(params.outfpath)

    # Obtain path to coverage file
    coverage_fpath: str = out.conf_path_to_depth_file(params.outfpath)

    # Count coverages with samtools depth
    print('Silently counting coverages with `samtools depth`...', end=' ')
    sys.stdout.flush()
    cov_fpath: str = oc.count_cov_for_all_refs(params.target_fasta_fpath,
                                               params.bam_fpath,
                                               coverage_fpath)
    print('done\n')

    # Proceed with annotation: one GenBank record per fasta sequence
    rec: SeqRecord
    for rec in fasta_records:
        print(f'Processing sequence `{rec.description}`')

        # Obtain coverages for current sequence
        cov_array: CoverageArray = oc.get_coverage_for_reference(
            rec.id, cov_fpath)

        # Check length of the coverage array: no positions at all means the
        # BAM's RNAME field does not match this sequence id — skip it.
        if len(cov_array) == 0:
            print(
                f'! Warning: no coverage information found for sequence `{rec.id}`.'
            )
            print(
                f"""! Please, make sure that field `RNAME` (3-rd column) in your BAM file contains
! id of this sequence specified in fasta header (i.e. `{rec.id}`).""")
            print('! Omitting this sequence.')
            print('=' * 10)
            with_warnings = ' with warnings'
            continue
        # end if

        # A coverage array shorter/longer than the sequence indicates a
        # stale or mismatched BAM — skip this sequence as well.
        if len(cov_array) != len(rec.seq):
            print(
                f"""! Warning: length of sequence `{rec.id}` ({len(rec.seq)} bp)
! is not equal to number of coverage positions ({len(cov_array)}) reported by `samtools depth`
! and stored in coverage file `{cov_fpath}`.""")
            print(
                '! Re-creating the bam file might be the solution of this issue.'
            )
            print('! Omitting this sequence.')
            print('=' * 10)
            with_warnings = ' with warnings'
            continue
        # end if

        mean_coverage = round(sts.mean(cov_array.coverages), 2)
        print(f'Average coverage: {mean_coverage}')

        cov_threshold: CoverageThreshold
        coverage_features: MutableSequence[SeqFeature]

        # Detect all necessary coverage features, one pass per threshold
        for cov_threshold in params.coverage_thresholds:
            print(
                f'Screening the sequence for regions with {cov_threshold.get_label()}...',
                end=' ')
            sys.stdout.flush()

            # Get coverage features, dropping any that duplicate
            # features already present on the record
            coverage_features = hlft.highlight_coverage_features(
                cov_array, cov_threshold, base_feature_note)
            coverage_features = ddf.dedupl_features(coverage_features,
                                                    rec.features)

            # Append features to list
            rec.features.extend(coverage_features)
            print('done')
        # end for

        print(f'Writing annotated sequence to `{params.outfpath}`...', end=' ')
        sys.stdout.flush()

        # Write result GenBank record
        out.write_genbank_output(rec, params.topology, params.organism,
                                 params.outfpath)
        print('done')
        print('=' * 10)
    # end for

    print(f'Completed{with_warnings}!')