Beispiel #1
0
def main():
    """Run the CRISPR target finder from the command line.

    Parses the command-line arguments, reads the FASTA input, generates
    candidate targets, filters and scores them, optionally annotates gene
    overlaps from a GFF file, and writes the surviving targets as a BED
    file to ``args.output``.

    Returns:
        None. Returns early, without writing any output, when no target
        passes filtering.

    Raises:
        SystemExit: with exit status 1 when the FASTA file, or a GFF file
            that was explicitly given, does not exist.
    """
    # configure the logger here
    # default to error, --verbose flag will set to INFO
    log = logging.getLogger(__name__)
    log.setLevel(logging.ERROR)
    log_formatter = logging.Formatter(
        'CRISPR_SCORE|%(levelname)s|: %(message)s')

    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.ERROR)
    stream_handler.setFormatter(log_formatter)
    log.addHandler(stream_handler)

    # import the custom parser
    parser = CRISPRArgumentParser(
        description=("Find & profile CRISPR targets in a fasta file"), )

    args = parser.parse_args()

    if args.verbose:
        # both the logger and its handler filter by level, so raise both
        log.setLevel(logging.INFO)
        stream_handler.setLevel(logging.INFO)

    # validate files
    # fasta must be given
    if not file_exists(args.fasta):
        log.error("fasta file %s does not exist -- Exiting", args.fasta)
        # non-zero status so shells and pipelines can detect the failure
        raise SystemExit(1)

    # GFF is optional
    if args.gff is not None and not file_exists(args.gff):
        log.error("gff file %s does not exist -- Exiting", args.gff)
        raise SystemExit(1)

    # set up data
    fasta = read_fasta_file(args.fasta)

    # the kmer_spectra is a dictionary mapping kmers to a count score
    # the score = (1 / count) * multiplier, so a kmer which appears once
    # has a base score of one, whereas a more common one (5) would have .2
    kmer_size = args.seed_end - args.seed_start + 1
    kmer_spectra = build_kmer_count(fasta, k=kmer_size)
    targets = []

    # yields a CRISPR target seq instance
    for target in generate_targets(fasta, target_length=args.length):
        log.info("Target found: %s", target)

        # filter first; only targets that pass are scored and kept
        if not filter_target(target,
                             gc_low=args.gc_low,
                             gc_high=args.gc_high,
                             homopolymer_length=args.homopolymer):
            continue

        # pass the scoring parameters from args Namespace as a dict
        target.score = score_target(target, kmer_spectra, **vars(args))
        targets.append(target)

    log.info("Filtering/Scoring complete, %i targets passed", len(targets))

    if not targets:
        log.warning("No targets passed filtering, exiting!")
        return

    # optionally check the gff file
    # overlaps are appended to the CrisprTarget genes[] data member
    if args.gff:
        log.info("Reading in GFF annotations")
        gff = read_gff_file(args.gff)
        log.info("Found %i genes from %s", len(gff), args.gff)
        if len(gff) > 0:
            targets = annotate_gene_overlaps(targets, gff)

    # write the output as a bed file
    write_bed_file(targets, fasta, args.output)
    def test_reading_invalid_gff(self):
        # The invalid GFF fixture must exist on disk; reading it is expected
        # to yield an empty result.
        gff_path = self.invalid_gff_file
        self.assertTrue(file_exists(gff_path))

        # the gene line is invalid, so it is skipped and nothing is returned
        parsed = read_gff_file(gff_path)
        gene_count = len(parsed)
        self.assertEqual(gene_count, 0)
    def test_reading_valid_gff(self):
        # The valid GFF fixture must exist on disk and parse to exactly one
        # entry, keyed by its attribute string and mapped to (1000, 7000).
        gff_path = self.valid_gff_file
        self.assertTrue(file_exists(gff_path))

        genes = read_gff_file(gff_path)
        gene_count = len(genes)
        self.assertEqual(gene_count, 1)

        gene_entry = genes["ID=gene1;Name=GENE1"]
        self.assertEqual(gene_entry, (1000, 7000))
 def test_reading_invalid_fasta(self):
     # The invalid FASTA fixture must exist on disk, yet reading it has to
     # raise ValueError.
     fasta_path = self.invalid_fasta_file
     self.assertTrue(file_exists(fasta_path))
     with self.assertRaises(ValueError):
         read_fasta_file(fasta_path)
 def test_reading_valid_fasta(self):
     # The valid FASTA fixture must exist on disk and the record read from
     # it must carry the id "valid_fasta".
     fasta_path = self.valid_fasta_file
     self.assertTrue(file_exists(fasta_path))

     record = read_fasta_file(fasta_path)
     record_id = record.id
     self.assertEqual(record_id, "valid_fasta")
 def test_file_doesnt_exist(self):
     # file_exists must report a falsy result for the path in self.bad_file.
     found = file_exists(self.bad_file)
     self.assertFalse(found)