def main():
    """Find, filter, score and report CRISPR targets in a fasta file.

    Steps, in order: configure logging, parse the command line with
    ``CRISPRArgumentParser``, check that the input files exist, read the
    fasta, build the kmer count table, generate candidate targets, keep and
    score those that pass ``filter_target``, optionally annotate gene
    overlaps from a GFF file, and write the result with ``write_bed_file``.

    Returns:
        None. Returns early (after a warning) when no target passes
        filtering, in which case no output is written.

    Raises:
        SystemExit: with status 1 when the fasta file, or a GFF file that
            was explicitly given, does not exist.
    """
    # configure the logger here
    # default to ERROR, the --verbose flag will raise it to INFO
    log = logging.getLogger(__name__)
    log.setLevel(logging.ERROR)
    log_formatter = logging.Formatter(
        'CRISPR_SCORE|%(levelname)s|: %(message)s')
    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.ERROR)
    stream_handler.setFormatter(log_formatter)
    log.addHandler(stream_handler)

    # import the custom parser
    parser = CRISPRArgumentParser(
        description=("Find & profile CRISPR targets in a fasta file"),
    )
    args = parser.parse_args()

    if args.verbose:
        log.setLevel(logging.INFO)
        stream_handler.setLevel(logging.INFO)

    # validate files
    # fasta must be given
    if not file_exists(args.fasta):
        log.error("fasta file %s does not exist -- Exiting", args.fasta)
        # a bare SystemExit exits with status 0; report the failure instead
        raise SystemExit(1)

    # GFF is optional
    if args.gff is not None and not file_exists(args.gff):
        log.error("gff file %s does not exist -- Exiting", args.gff)
        raise SystemExit(1)

    # set up data
    fasta = read_fasta_file(args.fasta)

    # the kmer_spectra is a dictionary mapping kmers to a count score
    # the score = (1 / count) * multiplier, so a kmer which appears once
    # has a base score of one, whereas a more common one (5) would have .2
    kmer_size = args.seed_end - args.seed_start + 1
    kmer_spectra = build_kmer_count(fasta, k=kmer_size)

    targets = []
    # yields a CRISPR target seq instance
    for target in generate_targets(fasta, target_length=args.length):
        log.info("Target found: %s", target)
        # filter and score
        if filter_target(target,
                         gc_low=args.gc_low,
                         gc_high=args.gc_high,
                         homopolymer_length=args.homopolymer):
            # pass the scoring parameters from the args Namespace as a dict
            target.score = score_target(target, kmer_spectra, **vars(args))
            targets.append(target)

    log.info("Filtering/Scoring complete, %i targets passed", len(targets))
    if not targets:
        log.warning("No targets passed filtering, exiting!")
        return

    # optionally check the gff file
    # overlaps are appended to the CrisprTarget genes[] data member
    if args.gff:
        log.info("Reading in GFF annotations")
        gff = read_gff_file(args.gff)
        log.info("Found %i genes from %s", len(gff), args.gff)
        if len(gff) > 0:
            targets = annotate_gene_overlaps(targets, gff)

    # write the output as a bed file
    write_bed_file(targets, fasta, args.output)
def test_reading_invalid_gff(self):
    """An existing GFF file with an invalid gene line parses to nothing."""
    gff_path = self.invalid_gff_file
    self.assertTrue(file_exists(gff_path))

    # the gene line is invalid, so it is skipped by the reader
    genes = read_gff_file(gff_path)
    self.assertEqual(len(genes), 0)
def test_reading_valid_gff(self):
    """A valid GFF file yields exactly one gene with the expected span."""
    gff_path = self.valid_gff_file
    self.assertTrue(file_exists(gff_path))

    genes = read_gff_file(gff_path)
    self.assertEqual(len(genes), 1)

    # the single entry is keyed by its attribute string
    gene_span = genes["ID=gene1;Name=GENE1"]
    self.assertEqual(gene_span, (1000, 7000))
def test_reading_invalid_fasta(self):
    """Reading an existing but invalid fasta file raises ValueError."""
    fasta_path = self.invalid_fasta_file
    self.assertTrue(file_exists(fasta_path))

    with self.assertRaises(ValueError):
        read_fasta_file(fasta_path)
def test_reading_valid_fasta(self):
    """A valid fasta file is read into a record with the expected id."""
    fasta_path = self.valid_fasta_file
    self.assertTrue(file_exists(fasta_path))

    record = read_fasta_file(fasta_path)
    self.assertEqual(record.id, "valid_fasta")
def test_file_doesnt_exist(self):
    """file_exists reports False for the fixture's bad path."""
    found = file_exists(self.bad_file)
    self.assertFalse(found)