Ejemplo n.º 1
0
    def __determine_sequences(self, query_seqs, target_seqs):
        """Private method to assign the sequence file variables.

        :param query_seqs: path to a query FASTA file (opened with pyfaidx),
            an already-built sequence index (stored unchanged), or None.
        :param target_seqs: iterable of paths to target FASTA files; each one
            is opened with pyfaidx and appended to ``self.target_seqs``.
        :raises ValueError: if the query path or any target path does not
            exist on disk.
        :return: None
        """

        if isinstance(query_seqs, str):
            # Validate explicitly: an ``assert`` is stripped under
            # ``python -O`` and would let a missing file through.
            if not os.path.exists(query_seqs):
                raise ValueError("{} not found!".format(query_seqs))
            self.query_seqs = pyfaidx.Fasta(query_seqs)
        elif query_seqs is None:
            self.query_seqs = None
        else:
            # Anything else is trusted to be a ready-made index object.
            self.logger.warning("Query type: %s", type(query_seqs))
            self.query_seqs = query_seqs

        self.target_seqs = []
        for target in target_seqs:
            if not os.path.exists(target):
                raise ValueError("{} not found!".format(target))
            self.target_seqs.append(pyfaidx.Fasta(target))
Ejemplo n.º 2
0
    def test_write(self):
        """
        Test the writing method of the fragment simulator.
        """
        # mkstemp() returns an already-open OS-level descriptor together with
        # the path; close it immediately so the test does not leak
        # descriptors (only the path is needed here).
        paths = []
        for _ in range(3):
            fd, path = tempfile.mkstemp()
            os.close(fd)
            paths.append(path)
        self.__fragments, self.__chromosomes, self.__map = paths

        try:
            self.__simulator.write(self.__map, self.__fragments,
                                   self.__chromosomes)

            # check if the correct number of fragment and chromosome
            # sequences was written
            fragment_fasta = pyfaidx.Fasta(self.__fragments)
            self.assertEqual(len(fragment_fasta.keys()),
                             self.__fragment_number + self.__unplaced_number)
            chromosome_fasta = pyfaidx.Fasta(self.__chromosomes)
            self.assertEqual(len(chromosome_fasta.keys()),
                             self.__chromosome_number)

            # check if a correct fragment map was written
            test_map = Map()
            test_map.read(self.__map)
        finally:
            # Always remove the temporary files, even when an assertion
            # fails; the .fai indices only exist once pyfaidx opened the
            # corresponding FASTA, hence the existence check.
            for leftover in (self.__fragments,
                             self.__fragments + '.fai',
                             self.__chromosomes,
                             self.__chromosomes + '.fai',
                             self.__map):
                if os.path.exists(leftover):
                    os.unlink(leftover)
Ejemplo n.º 3
0
def load_fasta(names, *filepaths):
    """
    Collect lazily loaded FASTA records from one or several files.

    Sequences are accessed through pyfaidx, so nothing is read into memory
    until a record is actually sliced.

    Parameters
    ----------
    names : sequence of str
        Names of the sequence records to collect, in the desired order.
    filepaths : str
        Paths to one or more FASTA files to gather records from.

    Returns
    -------
    OrderedDict of sequence name -> sequence record

    Raises
    ------
    ValueError
        If no file path is given.

    """
    import pyfaidx

    if not filepaths:
        raise ValueError("Need at least one file")

    if len(filepaths) > 1:
        # Several files: merge their record tables into one plain dict.
        source = {}
        for path in filepaths:
            source.update(pyfaidx.Fasta(path, as_raw=True).records)
    else:
        source = pyfaidx.Fasta(filepaths[0], as_raw=True)

    ordered = OrderedDict()
    for chrom in names:
        ordered[chrom] = source[chrom]
    return ordered
Ejemplo n.º 4
0
def SSSimulate(cores, haplotype, chromosome, start, end, error, coverage, length, indels, probability, insertsize, standarddev, output):
    """Simulate paired-end reads for one region and align them to the haplotype.

    The region ``chromosome:start-end`` is extracted from ``haplotype`` with
    ``samtools faidx``, read pairs are simulated with ``wgsim``, aligned back
    to ``haplotype`` with ``bwa mem`` and finally converted, sorted and
    indexed with ``samtools``. All files are written into ``output``; the
    final product is ``region.tmp.srt.bam`` plus its index. Intermediate
    FASTA/SAM/BAM files are removed, the two FASTQ files are left in place.

    ``insertsize`` and ``standarddev`` are accepted but not used by this
    function.

    Returns the string ``'Abort'`` when ``chromosome`` is not present in
    ``haplotype``; otherwise returns ``None``.
    """

    #prepare region

    fa = pyfaidx.Fasta(os.path.abspath(haplotype))

    if chromosome not in fa.keys():

        message = 'Abort'
        return message

    # Only the contig length is needed, which the index already provides;
    # there is no need to pull the whole chromosome into memory.
    chrom_length = len(fa[chromosome])

    region = chromosome + ':' + str(start) + '-' + str(end)
    region_fasta = os.path.abspath(output + '/region.tmp.fa')
    fastq_1 = os.path.abspath(output + '/region.1.fq')
    fastq_2 = os.path.abspath(output + '/region.2.fq')
    sam_path = os.path.abspath(output + '/region.tmp.sam')
    bam_path = os.path.abspath(output + '/region.tmp.bam')
    sorted_bam_path = os.path.abspath(output + '/region.tmp.srt.bam')

    # subprocess.DEVNULL replaces the repeated open(os.devnull, 'wb') calls,
    # each of which leaked an open file handle.
    with open(region_fasta, 'w') as regionout:

        subprocess.call(['samtools', 'faidx', haplotype, region], stdout=regionout, stderr=subprocess.DEVNULL)

    regionfa = pyfaidx.Fasta(region_fasta)
    chrf = regionfa[region]
    seqfa = chrf[:len(chrf)].seq
    Ns = seqfa.count('N')
    regionfa.close()

    if chrom_length < end-start:

        logging.warning(str(chromosome) + ' in haplotype ' + os.path.abspath(haplotype) + ' is shorter than region to simulate.')
        numreads = round((coverage*(chrom_length-Ns)) / length)/2 #calculate chosen coverage and divide by 2 'cause they are pairs

    else:

        numreads = round((coverage*(end-start-Ns)) / length)/2

    #simulate reads

    subprocess.call(['wgsim', '-e', str(error), '-N', str(numreads), '-1', str(length), '-2', str(length), '-R', str(indels), '-X', str(probability), region_fasta, fastq_1, fastq_2], stderr=subprocess.DEVNULL, stdout=subprocess.DEVNULL)

    os.remove(region_fasta)
    os.remove(region_fasta + '.fai')

    #align to modified reference

    with open(sam_path, 'w') as samout:

        subprocess.call(['bwa', 'mem', '-t', str(cores), haplotype, fastq_1, fastq_2], stdout=samout, stderr=subprocess.DEVNULL)

    with open(bam_path, 'w') as bamout:

        subprocess.call(['samtools', 'view', '-b', sam_path], stdout=bamout, stderr=subprocess.DEVNULL)

    os.remove(sam_path)

    with open(sorted_bam_path, 'w') as srtbamout:

        subprocess.call(['samtools', 'sort', '-@', str(cores-1), bam_path], stdout=srtbamout, stderr=subprocess.DEVNULL)

    os.remove(bam_path)

    subprocess.call(['samtools', 'index', sorted_bam_path], stderr=subprocess.DEVNULL)
Ejemplo n.º 5
0
def ancestral_fasta(args):
    """subroutine for ancestor subcommand

    Copies ``args.reference`` to ``args.output`` and edits the copy in place:
    regions outside the optional BED mask (``args.bed``, ``'-'`` for stdin)
    become ``N``, variants in ``args.vcf`` that are not biallelic SNPs become
    ``N``, and biallelic SNPs are polarized against the outgroup genome
    (``args.outgroup``) through the chain file ``args.chain``.

    Raises ``ValueError`` if the BED mask contains no intervals, or if a
    variant's REF allele disagrees with the reference sequence.
    """
    # single chromosome fasta file for reference genome
    ref = pyfaidx.Fasta(args.reference, read_ahead=10000)
    # make a copy to build our ancestor for this chromosome
    copyfile(args.reference, args.output)
    anc = pyfaidx.Fasta(args.output, read_ahead=10000, mutable=True)
    # reference genome for outgroup species (all chromosomes)
    out = pyfaidx.Fasta(args.outgroup, read_ahead=10000)
    # outgroup to reference alignment chain file
    lo = LiftOver(args.chain)
    # snps database for the same chromosome
    vcf = cyvcf2.VCF(args.vcf)

    # change regions outside of callability mask to all N bases
    if args.bed:
        if args.bed == '-':
            bed = sys.stdin
        else:
            bed = open(args.bed, 'r')
        chrom = None
        last_end = 0
        try:
            for line in bed:
                chrom, start, end = line.rstrip().split('\t')[:3]
                start = int(start)
                anc[chrom][last_end:start] = 'N' * (start - last_end)
                last_end = int(end)
        finally:
            # Close the mask file we opened ourselves; stdin is left alone.
            if bed is not sys.stdin:
                bed.close()
        if chrom is None:
            # Without a single interval the chromosome name is unknown, so
            # the trailing mask below cannot be applied.
            raise ValueError(f'callability mask {args.bed} contains no '
                             f'intervals')
        # mask everything after the last callable interval
        anc[chrom][last_end:len(anc[chrom])] = 'N' * (len(anc[chrom]) -
                                                      last_end)

    for variant in vcf:
        # change variants that are not biallelic SNPs to N bases
        if not (variant.is_snp and len(variant.ALT) == 1):
            anc[variant.CHROM][variant.start:variant.end] = 'N' * (
                variant.end - variant.start)
        else:
            out_coords = lo.convert_coordinate(variant.CHROM, variant.start)
            # change ambiguously aligning sites to N bases
            if out_coords is None or len(out_coords) != 1:
                anc[variant.CHROM][variant.start] = 'N'
            else:
                if variant.REF != ref[variant.CHROM][
                        variant.start].seq.upper():
                    raise ValueError(f'variant reference allele {variant.REF} '
                                     f'mismatches reference sequence '
                                     f'{ref[variant.CHROM][variant.start]}')
                out_chromosome, out_position, out_strand = out_coords[0][:3]
                out_allele = out[out_chromosome][out_position].seq
                # if negative strand, take reverse complement base
                if out_strand == '-':
                    out_allele = reverse_complement(out_allele)
                # and finally, polarize
                if out_allele.upper() == variant.ALT[0]:
                    anc[variant.CHROM][variant.start] = out_allele
                elif out_allele.upper() != variant.REF:
                    # triallelic
                    anc[variant.CHROM][variant.start] = 'N'
Ejemplo n.º 6
0
 def __init__(self, input_path, in_memory=False):
     """
     Constructs a new `Genome` object.

     With ``in_memory=True`` every sequence is read once, upper-cased and
     kept as a plain string; otherwise the pyfaidx handle itself is kept
     and sequences are fetched on demand.
     """
     super(Genome, self).__init__()
     self.in_memory = in_memory
     handle = pyfaidx.Fasta(input_path)
     if in_memory is not True:
         self.data = handle
     else:
         sequences = {}
         for name in handle.keys():
             sequences[name] = str(handle[name][:].seq).upper()
         self.data = sequences
         handle.close()
     # Length of every sequence, keyed by sequence name.
     lengths = {}
     for name in self.data.keys():
         lengths[name] = len(self.data[name])
     self.chrom_len_dict = lengths
Ejemplo n.º 7
0
def parse_fasta(fa_file):
    """Open a FASTA file with pyfaidx, working around plain-gzip input.

    :param str fa_file: path to the FASTA file (plain, bgzipped or gzipped)
    :return: the opened ``pyfaidx.Fasta`` object
    :raises subprocess.CalledProcessError: if ``gunzip`` or ``gzip`` fails
    """
    import subprocess

    _LOGGER.debug("Hashing {}".format(fa_file))
    try:
        fa_object = pyfaidx.Fasta(fa_file)
    except pyfaidx.UnsupportedCompressionFormat:
        # pyfaidx can handle bgzip but not gzip; so we just hack it here and
        # unzip the file for checksumming, then rezip it for the rest of the
        # asset build.
        # TODO: streamline this to avoid repeated compress/decompress
        # Argument lists (no shell) keep paths with spaces or shell
        # metacharacters intact, and check=True surfaces a failed
        # (de)compression instead of silently continuing.
        subprocess.run(["gunzip", fa_file], check=True)
        # Strip only the trailing ".gz"; str.replace would also remove a
        # ".gz" occurring elsewhere in the path.
        if fa_file.endswith(".gz"):
            fa_file_unzipped = fa_file[:-3]
        else:
            fa_file_unzipped = fa_file
        fa_object = pyfaidx.Fasta(fa_file_unzipped)
        subprocess.run(["gzip", fa_file_unzipped], check=True)
    return fa_object
Ejemplo n.º 8
0
    def test_reorder(self):
        """
        Verify that the written sequences follow the requested order and
        that missing sequences raise an error unless they are ignored.
        """
        reorderer = Reorder(self.__order)
        reorderer.write(self.__input, self.__output, ignore_missing=True)

        # the output must list, in the requested order, exactly those
        # sequences that are present in the input
        input_fasta = pyfaidx.Fasta(self.__input)
        output_fasta = pyfaidx.Fasta(self.__output)
        expected = []
        for name in reorderer.order:
            if name in input_fasta.keys():
                expected.append(name)
        self.assertEqual(expected, list(output_fasta.keys()))

        # without ignore_missing the same call has to fail
        with self.assertRaises(BioformatsError):
            reorderer.write(self.__input, self.__output, ignore_missing=False)
Ejemplo n.º 9
0
def parse_fasta(fa_file):
    """Open a FASTA file with pyfaidx, working around plain-gzip input.

    :param str fa_file: path to the FASTA file (plain, bgzipped or gzipped)
    :return: the opened ``pyfaidx.Fasta`` object
    :raises subprocess.CalledProcessError: if ``gunzip`` or ``gzip`` fails
    """
    import subprocess

    try:
        fa_object = pyfaidx.Fasta(fa_file)
    except pyfaidx.UnsupportedCompressionFormat:
        # pyfaidx can handle bgzip but not gzip; so we just hack it here and
        # unzip the file for checksumming, then rezip it for the rest of the
        # asset build.
        # TODO: streamline this to avoid repeated compress/decompress
        # in refgenie we feed this function with uncompressed, newly built
        # FASTA file, so compression issues are not relevant
        # Argument lists (no shell) keep paths with spaces or shell
        # metacharacters intact, and check=True surfaces a failed
        # (de)compression instead of silently continuing.
        subprocess.run(["gunzip", fa_file], check=True)
        # Strip only the trailing ".gz"; str.replace would also remove a
        # ".gz" occurring elsewhere in the path.
        if fa_file.endswith(".gz"):
            fa_file_unzipped = fa_file[:-3]
        else:
            fa_file_unzipped = fa_file
        fa_object = pyfaidx.Fasta(fa_file_unzipped)
        subprocess.run(["gzip", fa_file_unzipped], check=True)
    return fa_object
Ejemplo n.º 10
0
def main():
    """Parse the command line, open the reference and run `storeIndels`.

    All five options are mandatory; the output folder is created when it
    does not exist yet.
    """
    parser = argparse.ArgumentParser(
        description='Gather all the indels for the tag-targeted sites in a 40 bp window.'
    )
    # (flag, help text) for every required option, in display order
    required_options = (
        ('--bam_file', 'Sorted bam file with the mapped reads'),
        ('--primer_file',
         'Tab separated. A single line per target containing the closest primer to it.'),
        ('--basename', 'basename to be used'),
        ('--genome_reference', 'Indexed Genome Reference'),
        ('--output_folder', 'output folder'),
    )
    for flag, help_text in required_options:
        parser.add_argument(flag, help=help_text, required=True)
    args = parser.parse_args()

    out_dir = args.output_folder
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    genome = pyfaidx.Fasta(args.genome_reference)

    print('*** Running indelsGathering ***', file=sys.stderr)
    storeIndels(args.bam_file, args.primer_file, args.basename, genome,
                out_dir)
Ejemplo n.º 11
0
    def __init__(
            self,
            submission_queue,
            logging_queue,
            fasta,
            identifier,
            fasta_out,
            gtf_out,
            tmpdir,
            lenient=False,
            # strand_specific=False,
            canonical_splices=(("GT", "AG"), ("GC", "AG"), ("AT", "AC")),
            log_level="WARNING"):
        """Set up a checker worker bound to one FASTA file and two output paths.

        :param submission_queue: queue object stored on the instance; it is
            not read in this constructor.
        :param logging_queue: queue stored on the instance before
            ``create_queue_logger(self)`` is called.
        :param fasta: path (or handle) passed to ``pyfaidx.Fasta``.
        :param identifier: value stored as ``self.__identifier`` and used, via
            ``self.identifier``, in the worker name and output file names.
        :param fasta_out: base name of the FASTA output; the identifier is
            appended and the result is placed inside ``tmpdir``.
        :param gtf_out: base name of the GTF output, built the same way.
        :param tmpdir: directory in which both output paths are placed.
        :param lenient: flag stored as ``self.lenient``.
        :param canonical_splices: pairs of splice-site strings stored as
            ``self.canonical``.
        :param log_level: stored as ``self.log_level``.
        """

        super().__init__()
        self.__identifier = identifier
        # self.strand_specific = strand_specific
        self.canonical = canonical_splices
        self.log_level = log_level
        self.logger = None
        self.logging_queue = logging_queue
        # NOTE(review): ``self.identifier`` is read here although only
        # ``self.__identifier`` is assigned above -- presumably a property
        # defined elsewhere on the class; verify.
        self.name = "Checker-{0}".format(self.identifier)
        # Called only after name, log level and logging queue are set;
        # presumably it replaces the ``None`` logger assigned above, since
        # ``self.logger.debug`` is used at the end -- confirm in the helper.
        create_queue_logger(self)
        self.lenient = lenient
        self.__fasta = fasta
        self.submission_queue = submission_queue
        self.fasta = pyfaidx.Fasta(self.__fasta)
        # Per-worker output files: "<base name>-<identifier>" inside tmpdir.
        self.fasta_out = os.path.join(
            tmpdir, "{0}-{1}".format(fasta_out, self.identifier))
        self.gtf_out = os.path.join(tmpdir,
                                    "{0}-{1}".format(gtf_out, self.identifier))
        self.logger.debug(self.canonical)
Ejemplo n.º 12
0
def SPARKcreateBam(DataFrame, outbam):
    """Write the rows of a Spark DataFrame to a BAM file, then sort and index it.

    The BAM header is built from the contigs of ``chr1.fa`` in the current
    directory. Each row must expose ``flag``, ``Rname``, ``seq`` and ``QUAL``;
    rows whose flag is not 4 (unmapped) additionally need ``contig``,
    ``pos``, ``cigar``, ``mapq``, ``MDtag`` and ``cstag``.

    :param DataFrame: Spark DataFrame of alignment rows; all rows are
        collected to the driver with ``take(count())``.
    :param outbam: path of the unsorted BAM file to write. The sorted copy is
        written to ``test.srt.bam`` and indexed.
    """
    fa = pyfaidx.Fasta('chr1.fa')
    dict_fa = {'HD': {'VN': 1.6, 'SO': 'coordinate'}, 'SQ': [{'SN': x, 'LN': len(fa[x])} for x in fa.keys()]}
    fa.close()
    # count() is evaluated once; the collected rows are then iterated
    # directly instead of being indexed through a second count().
    rows = DataFrame.take(DataFrame.count())
    # The context manager closes the BAM even if a row fails to convert.
    with pysam.AlignmentFile(outbam, mode="wb", header=dict_fa) as fh:
        for row in rows:
            s = pysam.AlignedSegment(fh.header)
            if row.flag == 4:
                s.is_unmapped = True
                s.query_name = row.Rname
                s.query_sequence = row.seq
            else:
                s.is_unmapped = False
                s.reference_name = row.contig
                s.query_name = row.Rname
                s.query_sequence = row.seq
                s.reference_start = row.pos
                s.cigarstring = row.cigar
                s.is_reverse = row.flag == 16
                s.mapping_quality = row.mapq
                s.set_tags([("MD", row.MDtag, "Z"), ("cs", row.cstag, "Z")])
            # Qualities are assigned last, after the sequence is in place;
            # Phred+33 characters are converted to integer scores.
            s.query_qualities = np.array([ord(x) - 33 for x in row.QUAL])
            fh.write(s)
    # Sort the file that was just written; the original sorted the
    # hard-coded "test.bam" regardless of ``outbam``.
    pysam.sort("-o", "test.srt.bam", outbam)
    pysam.index("test.srt.bam")
Ejemplo n.º 13
0
def create_Bam(alignments, outbam):
    """Write alignment records to a BAM file, then sort and index it.

    The BAM header is built from the contigs of ``chr1.fa`` in the current
    directory. Records are ordered by ``(contig, pos)`` before writing. Each
    record must expose ``flag``, ``Rname``, ``seq`` and ``basequal``; records
    whose flag is not 4 (unmapped) additionally need ``contig``, ``pos``,
    ``cigar``, ``mapq``, ``MDtag`` and ``cstag``.

    :param alignments: iterable of alignment records.
    :param outbam: path of the unsorted BAM file to write. The sorted copy is
        written to ``test.srt.bam`` and indexed.
    """
    fa = pyfaidx.Fasta('chr1.fa')
    dict_fa = {'HD': {'VN': 1.6, 'SO': 'coordinate'}, 'SQ': [{'SN': x, 'LN': len(fa[x])} for x in fa.keys()]}
    fa.close()
    alignmentsSorted = sorted(alignments, key=attrgetter('contig', 'pos'))
    # The context manager closes the BAM even if a record fails to convert.
    with pysam.AlignmentFile(outbam, mode="wb", header=dict_fa) as fh:
        for subreads in alignmentsSorted:
            s = pysam.AlignedSegment(fh.header)
            if subreads.flag == 4:
                s.is_unmapped = True
                s.query_name = subreads.Rname
                s.query_sequence = subreads.seq
            else:
                s.is_unmapped = False
                s.reference_name = subreads.contig
                s.query_name = subreads.Rname
                s.query_sequence = subreads.seq
                s.reference_start = subreads.pos
                s.cigarstring = subreads.cigar
                s.is_reverse = subreads.flag == 16
                s.mapping_quality = subreads.mapq
                s.set_tags([("MD", subreads.MDtag, "Z"), ("cs", subreads.cstag, "Z")])
            # Qualities are assigned last, after the sequence is in place;
            # Phred+33 characters are converted to integer scores.
            s.query_qualities = np.array([ord(x) - 33 for x in subreads.basequal])
            fh.write(s)
    # Sort the file that was just written; the original sorted the
    # hard-coded "test.bam" regardless of ``outbam``.
    pysam.sort("-o", "test.srt.bam", outbam)
    pysam.index("test.srt.bam")
Ejemplo n.º 14
0
    def __init__(self, input_path, features):
        """
        Constructs a new `FastaFeatures` object.

        Every directory entry of `input_path` whose name contains '.fa' but
        not '.fai' is opened with pyfaidx (keeping the first of duplicated
        sequence names). Lookup tables between features / file names and
        their positions are built alongside.
        """
        self.data = []
        valid_fastas = []
        for fasta_in in os.listdir(input_path):
            # skip anything that is not a FASTA, and skip FASTA indices
            if '.fa' not in fasta_in or '.fai' in fasta_in:
                continue
            fasta_path = os.path.join(input_path, fasta_in)
            self.data.append(
                pyfaidx.Fasta(fasta_path, duplicate_action="first"))
            valid_fastas.append(fasta_in)

        self.n_features = len(features)

        # feature <-> index lookups
        self.feature_index_dict = {
            feat: index for index, feat in enumerate(features)
        }
        self.index_feature_dict = dict(enumerate(features))

        # file name <-> index lookups (same order as `self.data`)
        self.file_index_dict = {
            fasta: index for index, fasta in enumerate(valid_fastas)
        }
        self.index_file_dict = dict(enumerate(valid_fastas))

        self._features = features
Ejemplo n.º 15
0
 def __init__(self, input_path):
     """
     Constructs a `Proteome` object.

     Opens `input_path` with pyfaidx, stores the sorted sequence names and
     the value returned by `_get_len_prots`.
     """
     proteome = pyfaidx.Fasta(input_path)
     self.proteome = proteome
     names = list(proteome.keys())
     names.sort()
     self.prots = names
     self.len_prots = self._get_len_prots()
Ejemplo n.º 16
0
def main():
    """Run the ``select``, ``synonymous`` or ``flank`` command on a VCF file.

    * ``select``: write the variants that fall inside annotated regions of
      type ``args.selectionAnnotationType``.
    * ``synonymous``: split the variants located in a CDS into a synonymous
      and a non-synonymous VCF.
    * ``flank``: write the flanking sequences of the variants as FASTA.

    Any other command prints a goodbye message and exits with status 0.
    Every file opened here is closed again before returning, so the output
    VCFs are completely flushed to disk.
    """
    args = parse_args()
    ## read FASTA genome file
    fasta = pyfaidx.Fasta(args.genome)
    ## read VCF file
    with open(args.vcf, 'r') as vcf_handle:
        vcf_reader = list(vcf.Reader(vcf_handle))
    ## commands
    #### SELECT COMMAND
    if args.command == 'select':
        print("read VCF and select variants located on annotated '"+args.selectionAnnotationType+"' genome regions...", end="")
        dbfnFile = 'currentgff.db'
        ## read GFF3 file
        if os.path.exists(dbfnFile):
            os.remove(dbfnFile)
        db = gffutils.create_db(args.annotation, dbfn=dbfnFile)
        ## list of selected type annotated region
        annotationRegionList = select_annotation_type(db, fasta, args.selectionAnnotationType)
        ## write variant VCF into annotated region
        annotated_path = args.output_prefix+'_'+args.selectionAnnotationType+'.vcf'
        # the input VCF is reopened only to serve as header template
        with open(annotated_path, 'w') as annotated_handle, \
                open(args.vcf, 'r') as template_handle:
            vcf_writer_annotated = vcf.Writer(annotated_handle, vcf.Reader(template_handle))
            for variant in vcf_reader:
                for region in annotationRegionList:
                    if variant_position_within(variant, region):
                        vcf_writer_annotated.write_record(variant)
                        break
    #### SYNONYMOUS COMMAND
    elif args.command == 'synonymous':
        print("read VCF and detect synonymous and non-synonymous coding variants...", end="")
        dbfnFile = 'currentgff.db'
        ## read GFF3 file
        if os.path.exists(dbfnFile):
            os.remove(dbfnFile)
        db = gffutils.create_db(args.annotation, dbfn=dbfnFile)
        ## From the genome(GFF3, FASTA),
        ## extract a list of CDS (coding sequences) objects
        cdsSeqList = dbfasta2CdsSeq(db, fasta)
        ## check wether variant is within a CDS
        with open(args.output_prefix+'_synonymous.vcf', 'w') as syn_handle, \
                open(args.output_prefix+'_nonsynonymous.vcf', 'w') as nonsyn_handle, \
                open(args.vcf, 'r') as syn_template, \
                open(args.vcf, 'r') as nonsyn_template:
            vcf_writer_synonymous = vcf.Writer(syn_handle, vcf.Reader(syn_template))
            vcf_writer_non_synonymous = vcf.Writer(nonsyn_handle, vcf.Reader(nonsyn_template))
            for variant in vcf_reader:
                for cdsSeq in cdsSeqList:
                    if variant_position_within(variant, cdsSeq):
                        if is_synonymous(variant, cdsSeq):
                            vcf_writer_synonymous.write_record(variant)
                        else:
                            vcf_writer_non_synonymous.write_record(variant)
                        # only the first matching CDS is considered
                        break
    #### FLANK COMMAND
    elif args.command == 'flank':
        print("read VCF and extract flanking sequences of variants from the genome...", end= "")
        windowsSize = args.windowsSize
        sequences = vcf_flanking_sequences(vcf_reader, fasta, windowsSize)
        with open(args.output_prefix+"_flanking.fasta", "w") as output_handle:
            SeqIO.write(sequences, output_handle, "fasta")
    else:
        print("Au revoir !")
        sys.exit(0)
    print("done")
Ejemplo n.º 17
0
def worker(task_queue, output_queue, rest, fasta):
    """Search restriction sites on the chromosomes received through a queue.

    Chromosome names are taken from `task_queue` until a ``None`` sentinel
    arrives. For each chromosome a ``(name, '+', positions)`` tuple is put on
    `output_queue`, where the positions start with 0 and end with the
    chromosome length. When the reverse complement of the site differs from
    the site itself, a second ``(name, '-', positions)`` tuple with the bare
    reverse-complement hits follows.
    """
    cutting_idx, rest_seq = parse_rest(rest)
    rest_seq_rc = rc(rest_seq)
    faidx = pyfaidx.Fasta(fasta)

    while True:
        chr_ = task_queue.get()
        if chr_ is None:
            # sentinel: no more chromosomes for this process
            break

        seq = faidx[chr_][:].seq  # read sequence

        # forward strand: fragment boundaries framed by 0 and the length
        forward_cuts = [0]
        forward_cuts.extend(
            hit.start() + cutting_idx
            for hit in re.finditer(rest_seq, seq, re.IGNORECASE))
        forward_cuts.append(len(seq))
        output_queue.put((chr_, '+', forward_cuts))

        if rest_seq_rc == rest_seq:
            # palindromic site: the forward search already covers it
            continue

        reverse_cuts = [
            hit.start() + cutting_idx
            for hit in re.finditer(rest_seq_rc, seq, re.IGNORECASE)
        ]
        output_queue.put((chr_, '-', reverse_cuts))

    log.debug("Process-%d done" % mp.current_process().pid)
Ejemplo n.º 18
0
def main():
    """Split the variants located in a CDS into synonymous / non-synonymous VCFs.

    The annotation database ``currentgff.db`` is rebuilt from
    ``fichiers.genomeAnnotation`` on every run. Each variant is tested
    against the CDS list and written, for the first CDS containing it, to
    ``<outputPrefix>_synonymous.vcf`` or ``<outputPrefix>_nonsynonymous.vcf``.
    Every file opened here is closed again before returning, so the output
    VCFs are completely flushed to disk.
    """
    fichiers = parse_args()
    dbfnFile = 'currentgff.db'

    ## read GFF3 file
    if os.path.exists(dbfnFile):
        os.remove(dbfnFile)
    db = gffutils.create_db(fichiers.genomeAnnotation, dbfn=dbfnFile)
    ## read FASTA genome file
    fasta = pyfaidx.Fasta(fichiers.genomeFasta)
    ## From the genome(GFF3, FASTA),
    ## extract a list of CDS (coding sequences) objects
    cdsSeqList = dbfasta2CdsSeq(db, fasta)
    ## read VCF file
    with open(fichiers.vcf, 'r') as vcf_handle:
        vcf_reader = list(vcf.Reader(vcf_handle))
    ## check wether variant is within a CDS
    # the input VCF is reopened only to serve as header template
    with open(fichiers.outputPrefix + '_synonymous.vcf', 'w') as syn_handle, \
            open(fichiers.outputPrefix + '_nonsynonymous.vcf', 'w') as nonsyn_handle, \
            open(fichiers.vcf, 'r') as syn_template, \
            open(fichiers.vcf, 'r') as nonsyn_template:
        vcf_writer_synonymous = vcf.Writer(syn_handle,
                                           vcf.Reader(syn_template))
        vcf_writer_non_synonymous = vcf.Writer(nonsyn_handle,
                                               vcf.Reader(nonsyn_template))
        for variant in vcf_reader:
            for cdsSeq in cdsSeqList:
                if variant_position_within(variant, cdsSeq):
                    if is_synonymous(variant, cdsSeq):
                        vcf_writer_synonymous.write_record(variant)
                    else:
                        vcf_writer_non_synonymous.write_record(variant)
                    # only the first matching CDS is considered
                    break
Ejemplo n.º 19
0
def canonical_transcripts(db, fasta_filename):
    """Yield ``(transcript, sequence)`` for the canonical transcript of each gene.

    The canonical transcript is the one with the longest total CDS length;
    when no transcript of the gene has CDS features (or on ties), the one
    with the longest total length of its level-1 children wins. Genes
    without any transcript are skipped.

    :param db: annotation database providing ``features_of_type`` and
        ``children`` (used here like a gffutils ``FeatureDB``).
    :param fasta_filename: path to the genome FASTA file.
    """
    import pyfaidx
    fasta = pyfaidx.Fasta(fasta_filename, as_raw=True)
    for gene in db.features_of_type('gene'):

        # exon_list will contain (CDS_length, total_length, transcript, [exons]) tuples.
        exon_list = []
        for transcript in db.children(gene, level=1):
            exons = list(db.children(transcript, level=1))
            cds_len = sum(
                len(exon) for exon in exons if exon.featuretype == 'CDS')
            total_len = sum(len(exon) for exon in exons)
            exon_list.append((cds_len, total_len, transcript, exons))

        if not exon_list:
            # a gene without transcripts has no canonical transcript
            continue

        # Longest coding transcript first, longest overall as tie-break and
        # as fallback when nothing is coding (all CDS lengths are 0 then).
        # Comparing only the two integer lengths avoids both picking the
        # *shortest* entry (``sorted(...)[0]``) and comparing feature objects.
        best = max(exon_list, key=lambda entry: entry[:2])

        print(best)

        canonical_exons = best[-1]
        transcript = best[-2]
        seqs = [i.sequence(fasta) for i in canonical_exons]
        yield transcript, ''.join(seqs)
Ejemplo n.º 20
0
    def test_build(self, build_kws, mocker):
        """Builds a reference from the example files and inspects the result."""

        # Replace the real STAR invocation by a mock.
        star_index_mock = mocker.patch.object(star, 'star_index')

        # Run the build.
        star.StarIndexer().build(**build_kws)

        # The reference must expose all of its files.
        output_dir = build_kws['output_dir']
        ref = star.StarReference(output_dir)

        for attr in ('base_path', 'fasta_path', 'gtf_path',
                     'indexed_gtf_path'):
            assert getattr(ref, attr).exists()
        # assert ref.index_path.exists()
        assert ref.transposon_name == 'T2onc'
        for attr in ('transposon_path', 'features_path'):
            assert getattr(ref, attr).exists()

        # The augmented reference contains the transposon sequence.
        refseq = pyfaidx.Fasta(str(ref.fasta_path))
        assert sorted(refseq.keys()) == ['1', '2', 'T2onc']

        # STAR must have been asked exactly once to build the index.
        expected_call = {
            'fasta_path': ref.fasta_path,
            'gtf_path': ref.gtf_path,
            'output_dir': ref.index_path,
            'log_path': output_dir / 'star.log',
            'overhang': 100,
            'threads': 1,
        }
        star_index_mock.assert_called_once_with(**expected_call)
Ejemplo n.º 21
0
    def create_vac(self, bam_filename: str, vcf_filename: str,
                   out_vac_filename: str, ref_fasta_filename: str,
                   skip_indels: bool):
        """
        Convert a VCF file into a VAC file.

        BAM and VCF should use same reference genome.
        VCF must contain INFO column with sub-fields AC and AN.
        :param bam_filename: filename of the SAM/BAM file, from which the header is extracted
        :param vcf_filename: filename of the input VCF file
        :param out_vac_filename: filename of the output VAC file
        :param ref_fasta_filename: filename to reference FASTA file; may be
            None, in which case no reference is loaded
        :param skip_indels: whether to skip indels and keep only SNPs
        """
        # TODO use fasta index / vcf header instead of BAM header
        verbose = self._verbose

        # the reference FASTA is optional
        if ref_fasta_filename is None:
            ref_fasta = None
        else:
            if verbose:
                print('--- Loading Reference Fasta ---')
            ref_fasta = pyfaidx.Fasta(ref_fasta_filename)

        if verbose:
            print('--- Processing VCF %s ---' % vcf_filename)

        # all three files stay open for the duration of the conversion
        with pysam.VariantFile(vcf_filename) as vcf_file, \
                open_bam(bam_filename, 'rb') as sam_file, \
                open(out_vac_filename, 'wb') as out_vac_file:
            fasta_index = FastaIndex.from_bam(sam_file)
            converter = Vac(fasta_index, self._verbose)
            converter.vcf2vac(vcf_file, out_vac_file, ref_fasta, skip_indels)
Ejemplo n.º 22
0
def main():
    """Write the sequence of every gene of a GTF file to a FASTA-like file.

    Command line: ``<gtf file> <genome fasta> <output file>``. The gffutils
    database ``<gtf file>_gffutils.db`` is created on first use and reused
    afterwards.
    """
    gtf_file = argv[1]
    fasta_file = argv[2]
    out_file = argv[3]

    gtf_db = gtf_file + "_gffutils.db"

    # build the annotation database only once
    if not os.path.isfile(gtf_db):
        gffutils.create_db(gtf_file,
                           dbfn=gtf_db,
                           disable_infer_genes=True,
                           disable_infer_transcripts=True)

    db = gffutils.FeatureDB(gtf_db)
    fasta = pyfaidx.Fasta(fasta_file)

    # one entry per gene: header line (note the trailing space) + sequence
    records = []
    for gene in db.features_of_type("gene"):
        header = " ".join(
            (">" + gene.id, gene.chrom, str(gene.start), str(gene.end)))
        records.append(header + " \n" + gene.sequence(fasta))

    # the output is only opened once every gene has been extracted
    with open(out_file, 'w') as out:
        for record in records:
            out.write(record + "\n")
Ejemplo n.º 23
0
    def test_lowercase_ref_splice_site(self):
        """A five-exon model on NC_037283.1 must yield four canonical junctions."""
        reference = pkg_resources.resource_filename("Mikado.tests", "NC_037283.1.fa.gz")
        fasta = pyfaidx.Fasta(reference)
        fai = pysam.FastaFile(reference)

        # description of the transcript model handed to the checker
        lines = {
            "chrom": "NC_037283.1",
            "strand": '-',
            "start": 200431,
            "end": 204262,
            "attributes": dict(),
            "tid": "STRG.4616.1",
            "parent": "STRG.4616",
            "features": {
                "exon": [(200431, 200919), (201096, 201282),
                         (201446, 201512), (201776, 203421),
                         (203570, 204262)],
            },
        }

        # genomic sequence spanned by the model (1-based, inclusive)
        chrom_record = fasta[lines["chrom"]]
        seq = str(chrom_record[lines["start"] - 1:lines["end"]])

        logger, listener, logging_queue = self.create_logger("test_example_model")

        res = checking.create_transcript(lines, seq, lines["start"], lines["end"],
                                         logger=logger)
        listener.stop()
        self.assertIsInstance(res, transcripts.TranscriptChecker)
        self.assertEqual(res.attributes["canonical_number"], 4)
Ejemplo n.º 24
0
    def __init__(self, input_path, blacklist_regions=None, bases_order=None):
        """
        Constructs a `Genome` object.

        `blacklist_regions` may be "hg19" or "hg38" (bundled blacklist
        files), a path to a tabix-indexed file, or None. When `bases_order`
        is given, the base <-> index lookup tables are rebuilt for it.
        """
        self.genome = pyfaidx.Fasta(input_path)
        self.chrs = sorted(self.genome.keys())
        self.len_chrs = self._get_len_chrs()
        self._blacklist_tabix = None

        # resolve the blacklist file first, then open it in one place
        blacklist_path = None
        if blacklist_regions == "hg19":
            blacklist_path = pkg_resources.resource_filename(
                "selene_sdk",
                "sequences/data/hg19_blacklist_ENCFF001TDO.bed.gz")
        elif blacklist_regions == "hg38":
            blacklist_path = pkg_resources.resource_filename(
                "selene_sdk", "sequences/data/hg38.blacklist.bed.gz")
        elif blacklist_regions is not None:  # user-specified file
            blacklist_path = blacklist_regions
        if blacklist_path is not None:
            self._blacklist_tabix = tabix.open(blacklist_path)

        if bases_order is None:
            return

        bases = [str.upper(b) for b in bases_order]
        self.BASES_ARR = bases
        lc_bases = [str.lower(b) for b in bases]
        # both upper- and lower-case letters map to the same index
        base_to_index = {}
        for ix, b in enumerate(bases):
            base_to_index[b] = ix
        for ix, b in enumerate(lc_bases):
            base_to_index[b] = ix
        self.BASE_TO_INDEX = base_to_index
        self.INDEX_TO_BASE = dict(enumerate(bases))
        self.update_bases_order(bases)
Ejemplo n.º 25
0
def get_fasta_regions(fastaname, threads):
    """Split the reference into roughly equal-sized work units, one per thread.

    :param fastaname: path to the reference FASTA file; sequence names are
        truncated at the first whitespace.
    :param threads: number of work units to aim for.
    :return: list of regions, each region being a list of
        ``(chrom, start, end)`` tuples. Sequences shorter than 250000 bases
        are left out of the regions (but still count towards the total
        length used to size a unit).
    """
    # Use the function's own arguments; the original read the module-level
    # ``args.fasta`` / ``args.threads`` and ignored ``fastaname``.
    fasta = pyfaidx.Fasta(fastaname, key_function=lambda key: key.split()[0])
    total_reference_length = 0
    for chrom in sorted(fasta.keys()):
        total_reference_length += len(fasta[chrom])
    step_length = int(math.ceil(total_reference_length / threads))
    regions = []
    region = []
    region_so_far = 0   # bases already assigned to the current region
    chrom_so_far = 0    # bases of the current chromosome already assigned
    for chrom in sorted(fasta.keys()):
        chrom_length = len(fasta[chrom])
        if chrom_length < 250000:
            continue
        while True:
            if region_so_far + (chrom_length - chrom_so_far) < step_length:
                # the rest of this chromosome fits into the current region
                region.append((chrom, chrom_so_far, chrom_length))
                region_so_far += chrom_length - chrom_so_far
                chrom_so_far = 0
                break
            else:
                # fill the current region up to step_length and start a
                # new one within the same chromosome
                region.append((chrom, chrom_so_far,
                               chrom_so_far + step_length - region_so_far))
                regions.append(region)
                region = []
                chrom_so_far += step_length - region_so_far
                region_so_far = 0
    if len(region) > 0:
        if len(regions) == threads:
            # all units are taken: attach the remainder to the last one
            regions[-1] = regions[-1] + region
        else:
            regions.append(region)
    return regions
Ejemplo n.º 26
0
def calc_all(all_genes, bases_to_exclude, rscu_fh, gerp_fp, genome_fa,
             syn_gerp_out, bed_out):
    """
    Run the synonymous GERP calculation for every gene and write the results.

    The first line of the GERP file is read through gzip and split on tabs;
    the same file is then opened with tabix and the genome with pyfaidx.
    Each gene object performs its own calculation via ``calc_syn_gerp``.

    :param all_genes: dict whose values are gene objects exposing
        ``calc_syn_gerp``, ``gene``, ``syn_gerp`` and ``bed``
    :param bases_to_exclude: forwarded unchanged to ``calc_syn_gerp``
    :param rscu_fh: handed to ``read_rscu_f`` to load the RSCU data
    :param gerp_fp: path to the gzip-compressed GERP file, also opened
        with tabix
    :param genome_fa: path to the reference genome FASTA, opened with pyfaidx
    :param syn_gerp_out: writable handle that receives a header plus one
        ``gene<TAB>syn_gerp`` line per gene
    :param bed_out: writable handle that receives a header plus every line
        of each gene's ``bed``
    """
    # Only the header line of the GERP file is needed here.
    with gzip.open(gerp_fp, 'rt') as header_handle:
        gerp_columns = header_handle.readline().strip().split("\t")

    gerp_index = tabix.open(gerp_fp)
    reference = pyfaidx.Fasta(genome_fa)

    rscu_table = read_rscu_f(rscu_fh)

    syn_gerp_out.write("#GENE\tSYN_GERP\n")

    bed_out.write("#CHROM\tPOS\tSTRAND\tGENE\tCDS_POS\tCODON\tRSCU\tGERP\n")

    for gene in all_genes.values():
        gene.calc_syn_gerp(reference, gerp_columns, gerp_index, rscu_table,
                           bases_to_exclude)
        summary_row = "{}\t{}\n".format(gene.gene, gene.syn_gerp)
        syn_gerp_out.write(summary_row)
        for bed_line in gene.bed:
            bed_out.write(bed_line)
Ejemplo n.º 27
0
def calculate_refpos():
    """Report SNPs whose ancestral allele differs from the reference base.

    The reference FASTA named by ``config['ref_fasta_hg38']`` is obtained via
    ``get_FASTA_fromweb`` and opened with pyfaidx. SNP definitions are read
    from the CSV file ``config['b38_snp_file']`` in the ``cache`` data
    directory (columns ``Name``, ``start``, ``allele_anc``, ``allele_der``).
    Every SNP that is not an ``ins``/``del``, whose position is greater than
    1 and whose ancestral allele does not match the upper-cased reference
    base at that position is written to ``cache/refpos-detect.out`` as
    ``<name> <(start, anc, der)>``.
    """
    trace(1, 'calculating refpos')

    fname = get_FASTA_fromweb(config['ref_fasta_hg38'])

    # assumes the first line has the form ">seqname bla bla bla";
    # the handle is closed explicitly instead of being leaked
    with open(fname) as fasta_file:
        seqname = fasta_file.read(100).split()[0][1:]
    ref = pyfaidx.Fasta(fname)

    snpdict = {}
    snpdef = data_path(os.path.join('cache', config['b38_snp_file']))
    with open(snpdef) as snpfile:
        for line in csv.DictReader(snpfile):
            snpdict[line['Name']] = (line['start'], line['allele_anc'],
                                     line['allele_der'])

    def check(snpname):
        """Return True if the SNP's ancestral allele equals the reference base."""
        start, anc = snpdict[snpname][:2]
        pos = int(start)
        return anc == str(ref.faidx.fetch(seqname, pos, pos)).upper()

    # write out the detected refpos along with its definition
    refpos = data_path(os.path.join('cache', 'refpos-detect.out'))
    with open(refpos, 'w') as fn:
        for snp, definition in snpdict.items():
            # Cheap filters first, so the reference is only queried for
            # candidates that can actually be reported.
            if definition[1] in ('ins', 'del'):
                continue
            if int(definition[0]) <= 1:
                continue
            if not check(str(snp)):
                fn.write('{} {}\n'.format(snp, definition))
Ejemplo n.º 28
0
def calc_proteins(sourcebase, reffile, vcffile):
    """Translate and write gene and exon sequences for every GFF file.

    GFF files are located with ``get_gff_files(sourcebase)``. For each file
    an in-memory gffutils database is built and every ``gene`` feature is
    passed to ``get_gene_sequences``; the returned exon and gene sequences
    are translated with ``translate_records`` and written with
    ``write_records``. A summary line with the counters is printed per file.

    :param sourcebase: passed to ``get_gff_files`` to find the GFF inputs
    :param reffile: path to the reference FASTA opened with pyfaidx
    :param vcffile: path to a VCF file opened with ``VariantFile``, or
        ``None`` when no variants are supplied
    """
    gfffiles = get_gff_files(sourcebase)
    if vcffile is not None:
        print("VCFFile Provided")
        vcfrecords = VariantFile(vcffile)
    else:
        # An empty string is what get_gene_sequences receives when there is
        # no VCF input.
        vcfrecords = ''
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("DEBUG: reffile for pyfaidx is: %s" % reffile)
    fasta = pyfaidx.Fasta(reffile)

    for infile in gfffiles:
        total_genes = 0
        total_exons = 0
        total_genes_wrote = 0
        total_exons_wrote = 0
        total_mod = 0
        db = gffutils.create_db(infile, ':memory:')
        for gene in db.features_of_type('gene'):
            total_genes += 1
            seqs, geneseq, num_mod = get_gene_sequences(
                gene, db, vcfrecords, fasta)
            total_exons += len(seqs)
            total_mod += num_mod
            # write_records returns the number of records it wrote.
            total_exons_wrote += write_records(translate_records(seqs),
                                               'exons')
            total_genes_wrote += write_records(translate_records(geneseq),
                                               'genes')
        print("processed (and wrote): %s with %i(%i) genes and %i(%i) exons (%i modified))" % (
            infile, total_genes, total_genes_wrote, total_exons,
            total_exons_wrote, total_mod))
Ejemplo n.º 29
0
def main():
    """Command-line entry point: print selected records of a FASTA file.

    The identifiers come either from a file (one per line) or from a
    comma-separated string. With ``-v/--reverse`` the records whose names are
    *not* in the list are printed instead. Sequences are written to the
    output wrapped at 60 characters per line.

    :raises KeyError: if a requested identifier is absent from the FASTA
        file (normal, non-reverse mode only)
    """

    def check_type(string):
        """Return an open handle for an existing file, else a set of names."""
        if os.path.exists(string) and not os.path.isdir(string):
            return open(string)
        return set(string.split(','))

    parser = argparse.ArgumentParser(description='A simple script that retrieves the FASTA sequences from a file given a list of ids.')
    parser.add_argument("-v", "--reverse", action="store_true", default=False, help="Retrieve entries which are not in the list, as in grep -v (a homage).")
    parser.add_argument('list', type=check_type, help='File with the list of the ids to recover, one by line. Alternatively, names separated by commas.')
    parser.add_argument('fasta', type=argparse.FileType('r'), help='FASTA file.')
    parser.add_argument('out', type=argparse.FileType('w'), help='Optional output file.', nargs='?', default=sys.stdout)
    args = parser.parse_args()

    if isinstance(args.list, IOBase):
        # Close the id file once read; blank lines are not identifiers.
        with args.list as id_file:
            ids = {line.rstrip() for line in id_file if line.strip()}
    else:
        ids = args.list

    # argparse opened the file only to validate it; pyfaidx needs the path.
    args.fasta.close()
    fasta = pyfaidx.Fasta(args.fasta.name)

    if args.reverse:
        # grep -v semantics: everything in the FASTA that was NOT listed.
        names = [name for name in fasta.keys() if name not in ids]
    else:
        names = ids

    for name in names:
        if name not in fasta:
            # Explicit check: an assert would be stripped under ``python -O``.
            raise KeyError("{0} not found in {1}".format(name, args.fasta.name))
        print(">{0}".format(name), file=args.out)
        print(*textwrap.wrap(str(fasta[name]), width=60),
              sep="\n", file=args.out)
Ejemplo n.º 30
0
    def __load_configuration(self):

        """Private method to load the configuration.

        If ``self.configuration`` is a string it is treated as a path and
        replaced by the object returned from ``load_and_validate_config``;
        otherwise it must already be a ``MikadoConfiguration`` or
        ``DaijinConfiguration``. The multiprocessing start method is then set
        from the configuration, the input file is recorded, the logger is set
        up and, when padding is enabled, the reference genome is checked by
        opening it with pyfaidx.

        :raises TypeError: if ``self.configuration`` is neither a string nor
            one of the accepted configuration classes
        :raises SystemExit: with status 1 if padding is requested but the
            reference genome cannot be opened
        """

        if isinstance(self.configuration, str):
            assert os.path.exists(self.configuration)
            self.configuration = load_and_validate_config(self.configuration, logger=self.logger)
            assert isinstance(self.configuration, (MikadoConfiguration, DaijinConfiguration))
            # pylint: disable=no-member
        elif not isinstance(self.configuration, (MikadoConfiguration, DaijinConfiguration)):
            raise TypeError(type(self.configuration))

        multiprocessing.set_start_method(self.configuration.multiprocessing_method,
                                         force=True)
        self.input_file = self.configuration.pick.files.input
        self.setup_logger()

        if self.configuration.pick.alternative_splicing.pad is True:
            # Check that, when padding is requested, the reference genome is present
            self.logger.debug("Checking for the presence of the reference genome")
            try:
                _ = pyfaidx.Fasta(self.configuration.reference.genome)
            except (pyfaidx.FastaIndexingError, FileNotFoundError, pyfaidx.FastaNotFoundError):
                # Adjacent literals instead of a backslash continuation inside
                # the string, which embedded the source indentation in the
                # logged message.
                self.logger.error(
                    "Transcript padding cannot be executed without a valid genome file. "
                    "Please, either disable the padding or provide a valid genome sequence.")
                sys.exit(1)
            self.logger.debug("Valid reference genome found")

        self.context = multiprocessing.get_context()
        self.logger.debug("Configuration loaded successfully")