Example #1
def chunk_fastq_file(fastq_filename, new_filename, parse_rec):
    """
    Create a new FASTQ file from an existing one.

    :param str fastq_filename: the name of the original FASTQ file
    :param str new_filename: the name of the new FASTQ file
    :param ParseRecord parse_rec: the offsets describing which chunk to extract
    :return: None
    """
    try:
        os.remove(new_filename)
    except OSError:
        pass  # the target file may not exist yet

    # copy the header from the original FASTQ file to the new one
    bytes_from_file(fastq_filename, new_filename, 0, parse_rec.header_size)

    if parse_rec.begin_read_offset > 0:
        # if there are reads before a chunk offset, we need to extract them
        b = bgzf.BgzfReader(fastq_filename)
        b2 = bgzf.BgzfWriter(new_filename, mode="a")
        b.seek(parse_rec.begin_read_offset)
        b2.write(b.read(parse_rec.begin_read_size))
        b2.close()
        b.close()

    # grab bgzf chunks from the OLD FASTQ file and append to NEW FASTQ file
    bytes_from_file(fastq_filename, new_filename, parse_rec.file_offset, parse_rec.file_bytes)

    if parse_rec.end_read_offset > 0:
        # if there are reads after a chunk offset, we need to extract them
        b = bgzf.BgzfReader(fastq_filename)
        b2 = bgzf.BgzfWriter(new_filename, mode="a")
        b.seek(parse_rec.end_read_offset)
        b2.write(b.read(parse_rec.end_read_size))
        b2.close()
        b.close()
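The `bytes_from_file` helper used above is not shown in the snippet; a minimal sketch of what it presumably does (append `length` raw bytes starting at `offset` from one file onto another) could look like this:

def bytes_from_file(src_filename, dst_filename, offset, length, bufsize=65536):
    # Hypothetical helper, not part of the original snippet: copy the raw
    # byte range [offset, offset + length) from src_filename onto the end
    # of dst_filename, in buffered chunks.
    with open(src_filename, "rb") as src, open(dst_filename, "ab") as dst:
        src.seek(offset)
        remaining = length
        while remaining > 0:
            chunk = src.read(min(bufsize, remaining))
            if not chunk:
                break  # hit EOF early
            dst.write(chunk)
            remaining -= len(chunk)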
Example #2
def filter_pe_fastq_by_len(fq_1, fq_2, minlen, prefix):
    '''Filter paired-end reads, keeping pairs where both mates exceed minlen.'''
    fq_1_ = prefix + ".gt" + str(minlen) + ".1.fq.gz"
    fq_2_ = prefix + ".gt" + str(minlen) + ".2.fq.gz"
    with bgzf.BgzfWriter(fq_1_, 'wb') as out_1, bgzf.BgzfWriter(fq_2_, 'wb') as out_2:
        with gzip.open(fq_1, 'rt') as in_1, gzip.open(fq_2, 'rt') as in_2:
            for rec_a, rec_b in zip(SeqIO.parse(in_1, 'fastq'), SeqIO.parse(in_2, 'fastq')):
                if (len(rec_a.seq) > minlen) and (len(rec_b.seq) > minlen):
                    SeqIO.write(rec_a, out_1, 'fastq')
                    SeqIO.write(rec_b, out_2, 'fastq')
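For context, a hypothetical invocation with the imports the snippet relies on (the file names are illustrative only):

import gzip
from Bio import SeqIO, bgzf

filter_pe_fastq_by_len("sample.1.fq.gz", "sample.2.fq.gz", 50, "sample")
# -> writes sample.gt50.1.fq.gz and sample.gt50.2.fq.gz, keeping only pairs
#    where both mates are longer than 50 bp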
Example #3
 def __call__(self):
     global F_Flag
     F_Flag = self.Fa_Flag
     FastQFlag = self.fqflag
     read_lists = []
     MinTime = time.time()
     FailedReads = 0
     ChrList = [[] for _ in range(512)]
     global Visits
     Visits = []
     ChannelDict = {str(il): () for il in range(1, 513)}
     Outdata = []
     global f5
     f5 = h5py.File(self.multi_fast5, 'r')
     f5.visit(Visits.append)
     reads_list_to_read = list(f5.keys())  # list() so the first read can be indexed below
     RefStart = time.mktime(
         par.parse(f5[str(reads_list_to_read[0]) +
                      '/tracking_id'].attrs['exp_start_time'].decode(
                          'UTF-8')).timetuple())
     for r in reads_list_to_read:
         res = self.get_content(r, FastQFlag)
         Outdata.append(res)
     if FastQFlag:
         file_out = os.path.join(self.t_dir,
                                 'tmp.' + str(self.Norder) + '.fastq.gz')
         Gzout = yielder(read_lists)
         with bgzf.BgzfWriter(file_out, "wb") as outgz:
             SeqIO.write(sequences=Gzout, handle=outgz, format="fastq")
     Outdata.append(RefStart)
     return Outdata
Example #4
    def output_vcf_population(self, control_size, test_size, male_odds, compression_level):
        """
        Output a population .vcf file and companion .fam file.
        :param compression_level: level of gzip compression (1-9)
        :param test_size: size of the cases/test group
        :param control_size: size of the control group
        :param male_odds: odds of a person being a biological male
        :return:
        """

        if not self.ordered_snps:
            raise Exception("No SNPs to Process! Exiting.")
        # pick deleterious groups for population size
        deleterious_group_list = PopulationFactory.pick_deleterious_groups(list(self.deleterious.values()), test_size)

        fam_data = self.generate_fam_file(control_size, test_size, male_odds, deleterious_group_list)
        main_file = self.population_dir + "population.vcf.gz"
        CHUNK_SIZE = 500000  # Defines work chunks that have a sync point after each one. Helps with memory issues.
        with bgzf.BgzfWriter(filename=main_file, mode='wt+', compresslevel=compression_level) as f:
            header = gen_vcf_header(fam_data)
            f.write(header)
            print("Outputing VCF lines", flush=True)
            chunks = int(len(self.ordered_snps) / CHUNK_SIZE)
            if chunks < 1:
                chunks = 1
            for i, snps_list in enumerate(split_list(self.ordered_snps, chunks)):
                self.write_vcf_snps(fam_data, snps_list, f)
                print("%s Finished work chunk %i of %i." %
                      (datetime.now().strftime("%Y-%m-%d %H:%M"), i + 1, chunks), flush=True)

        print("Finished VCF file output.", flush=True)
Example #5
 def __call__(self):
     file_out = os.path.join(self.t_dir,
                             'tmp.' + str(self.Norder) + '.fastq.gz')
     Gzout = get_content(self.datas)
     with bgzf.BgzfWriter(file_out, "wb") as outgz:
         SeqIO.write(sequences=Gzout, handle=outgz, format="fastq")
     return file_out
Example #6
    def __init__(self, path, mode='r'):
        """
        Store tabular information tied to genomic locations in a bgzipped file
        Args:
            path (str) : path to file
            mode (str) : mode, r: read, w: write
        """
        self.path = path
        self.index_path = f'{path}.idx'
        self.prev_contig = None
        self.mode = mode
        self.index = {}

        if self.mode == 'w':
            self.bgzf_handle = bgzf.BgzfWriter(self.path, 'w')
            self.index_handle = open(self.index_path, 'wt')
        elif self.mode == 'r':
            if not os.path.exists(self.path):
                raise ValueError(f'BGZF file missing at {self.path}')
            self.bgzf_handle = bgzf.BgzfReader(self.path, 'rt')
            if not os.path.exists(self.index_path):
                raise ValueError(
                    f'BGZIP index file missing at {self.index_path}')
            self.index_handle = open(self.index_path, 'rt')

            for line in self.index_handle:
                contig, start = line.strip().split()
                self.index[contig] = int(start)
        else:
            raise ValueError("Mode must be 'r' or 'w'")
        self.cache = {}
Example #7
def filter(logger, source_file, target_file, output_prefix, method):
    check_file_exists(source_file)
    check_file_exists(target_file)

    check_tool_exists("bedtools")
    check_tool_exists("tabix")

    tmp_bed = output_prefix + ".tmp.bed"

    method_command = "bedtools %s -a \"%s\" -b \"%s\" > %s" % (
        method, source_file, target_file, tmp_bed)
    runCmd(method_command, logger)

    if not os.path.isfile(tmp_bed):
        raise Exception("bedtools failed, no output file generated.")

    tmp_file = output_prefix + ".tmp.bed.bgz"
    logger.info("Writing dinucleotide to " + tmp_file + " ...")
    with bgzf.BgzfWriter(tmp_file, "wb") as fout:
        with open(tmp_bed, "rt") as fin:
            for line in fin:
                fout.write(line)
    os.remove(tmp_bed)

    output_file = output_prefix + ".bed.bgz"
    if os.path.exists(output_file):
        os.remove(output_file)
    os.rename(tmp_file, output_file)
    runCmd("tabix -p bed %s " % output_file, logger)

    count_file = output_prefix + ".count"
    dinucleotide_to_count(logger, output_file, count_file)

    logger.info("done.")
Example #8
def main(args):
    if args.file:
        with open(args.file[0]) as f:
            data = f.read()
    elif not sys.stdin.isatty():
        data = sys.stdin.read()
    else:
        raise ValueError('No input data detected')

    w = bgzf.BgzfWriter(fileobj=sys.stdout.buffer)
    w.write(data)
    w.close()
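Assuming the script is saved as, say, to_bgzf.py and that args.file comes from an argparse option not shown here (the flag name is an assumption), it can be driven from a file argument or from a pipe:

#   python to_bgzf.py --file input.txt > output.gz
#   cat input.txt | python to_bgzf.py > output.gz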
Example #9
def extract_diff_seq(diff_genes, fasta):
    gene_dict = {each.strip(): 1 for each in open(diff_genes)}
    seq_list = []
    for seq_record in SeqIO.parse(fasta, "fasta"):
        gene_id = re.search(r'gene=(\S+)', seq_record.description).groups()[0]
        if gene_id in gene_dict:
            seq_list.append(seq_record)

    d_path, d_name = os.path.split(os.path.abspath(diff_genes))
    d_prefix = os.path.splitext(d_name)[0]
    d_fa_path = os.path.join(d_path, '{p}.fa.gz'.format(p=d_prefix))
    with bgzf.BgzfWriter(d_fa_path, "wb") as outgz:
        SeqIO.write(sequences=seq_list, handle=outgz, format="fasta")
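A hypothetical call, assuming a plain-text gene list with one ID per line and a FASTA whose descriptions carry gene=... tags (file names are illustrative):

extract_diff_seq("diff_genes.txt", "transcripts.fa")
# -> writes diff_genes.fa.gz (BGZF-compressed) next to diff_genes.txt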
Example #10
    def __init__(self, path, mode='r', read_all=False):
        """
        Store tabular information tied to genomic locations in a bgzipped file
        Args:
            path (str) : path to file
            mode (str) : mode, r: read, w: write

            read_all(bool) : when enabled all data is read from the file and the handles are closed
        """
        self.path = path
        self.index_path = f'{path}.idx'
        self.prev_contig = None
        self.mode = mode
        self.index = {}
        self.cache = {}

        if self.mode == 'w':
            self.bgzf_handle = bgzf.BgzfWriter(self.path, 'w')
            self.index_handle = open(self.index_path, 'wt')
        elif self.mode == 'r':
            if not os.path.exists(self.path):
                raise ValueError(f'BGZF file missing at {self.path}')
            self.bgzf_handle = bgzf.BgzfReader(self.path, 'rt')
            if not os.path.exists(self.index_path):
                raise ValueError(
                    f'BGZIP index file missing at {self.index_path}')
            self.index_handle = open(self.index_path, 'rt')

            for line in self.index_handle:
                contig, start = line.strip().split()
                self.index[contig] = int(start)

            if read_all:
                for line in self.bgzf_handle:
                    if len(line) == 0:
                        continue
                    line_contig, line_pos, line_strand, rest = self.read_file_line(
                        line)
                    if line_contig not in self.cache:
                        self.cache[line_contig] = {}
                    self.cache[line_contig][(line_pos, line_strand)] = rest
                self.bgzf_handle.close()
                self.bgzf_handle = None
                self.index_handle.close()
                self.index_handle = None

        else:
            raise ValueError("Mode must be 'r' or 'w'")
Example #11
 def sideEffect(self, filename, *args, **kwargs):
     if self.count <= 1:
         self.test.assertEqual('filename.fasta.bgz', filename)
         self.count += 1
         writerIO = BytesIO()
         writer = bgzf.BgzfWriter(fileobj=writerIO)
         writer.write(b'>id0\nAC\n')
         writer.flush()
         fileobj = BytesIO(writerIO.getvalue())
         fileobj.mode = 'rb'
         return bgzf.BgzfReader(fileobj=fileobj)
     else:
         self.test.fail(
             'Open called too many times. Filename: %r, Args: %r, '
             'Keyword args: %r.' % (filename, args, kwargs))
Example #12
def main(vcf_input, output, remove_fields=[], keep_fields=[]):
    '''
        Remove INFO fields from VCF.

        Args:
            vcf_input:  input VCF file

            output:     VCF output file. Will write to STDOUT if not
                        provided.

            remove_fields:
                        One or more INFO fields to remove. Cannot be
                        used in conjunction with the keep_fields argument.

            keep_fields:
                        One or more INFO fields to keep. All other INFO
                        fields defined in the VCF header will be removed.
                        Cannot be used in conjunction with the remove_fields
                        argument.

    '''
    if remove_fields and keep_fields:
        raise RuntimeError("remove_fields and keep_fields arguments are " +
                           "mutually exclusive.")
    vcf = VcfReader(vcf_input)
    if output is None:
        vcf_writer = sys.stdout
    elif output.endswith(('.gz', '.bgz')):
        from Bio import bgzf
        vcf_writer = bgzf.BgzfWriter(output)
    else:
        vcf_writer = open(output, 'w')
    new_head = []
    if keep_fields:
        remove_fields = [
            x for x in vcf.header.metadata['INFO'].keys()
            if x not in keep_fields
        ]
    for h in vcf.header.meta_header:
        match = info_re.match(h)
        if not match or match.group(1) not in remove_fields:
            new_head.append(h)
    vcf_writer.write("\n".join(new_head) + "\n")
    vcf_writer.write("\t".join(vcf.col_header) + "\n")
    for record in vcf:
        record.remove_info_fields(remove_fields)
        vcf_writer.write(str(record) + "\n")
    vcf_writer.close()
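A sketch of an invocation, assuming the surrounding module provides VcfReader and info_re (the file names are illustrative):

main("input.vcf.gz", "filtered.vcf.gz", keep_fields=["AF", "DP"])
# keeps only AF and DP in the INFO column; the .gz suffix selects BGZF output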
Example #13
def reheader_fasta(fa_in, fa_out, header_function, in_gz, gz):
    if in_gz:
        in_h = gzip.open(fa_in, 'rt')
    else:
        in_h = open(fa_in, 'r')
    if gz:
        out_h = bgzf.BgzfWriter(fa_out, 'wb')
    else:
        out_h = open(fa_out, 'w')
    writer = FastaWriter(out_h)
    writer.write_header()
    for rec in FastaIterator(in_h, title2ids=header_function):
        writer.write_record(rec)
    writer.write_footer()
    out_h.close()
    in_h.close()
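FastaWriter and FastaIterator here are the legacy Bio.SeqIO.FastaIO classes, whose title2ids callback is expected to return an (id, name, description) tuple. A hypothetical call that keeps only the first whitespace-delimited token as the ID:

def first_token(title):
    # title2ids callback: (id, name, description)
    token = title.split()[0]
    return token, token, title

reheader_fasta("genome.fa.gz", "renamed.fa.gz", first_token, in_gz=True, gz=True)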
Example #14
    def rewrite(self, compressed_input_file, output_file):
        h = gzip.open(compressed_input_file, "rb")
        data = h.read()
        h.close()

        h = bgzf.BgzfWriter(output_file, "wb")
        h.write(data)
        h.close()  # Gives empty BGZF block as BAM EOF marker

        h = gzip.open(output_file)
        new_data = h.read()
        h.close()

        # Check the decompressed files agree
        self.assertTrue(new_data, "Empty BGZF file?")
        self.assertEqual(len(data), len(new_data))
        self.assertEqual(data, new_data)
Example #15
def fasta2dinucleotide(logger,
                       fasta_file,
                       bed_file,
                       output_prefix,
                       is_test=False):
    check_file_exists(fasta_file)
    check_file_exists(bed_file)

    regions = read_coordinate_file(bed_file, "region", checkOverlap=True)
    chromRegionMap = {}
    for region in regions:
        chromRegionMap.setdefault(region.reference_name, []).append(region)

    tmp_file = output_prefix + ".tmp.bed.bgz"
    with bgzf.BgzfWriter(tmp_file, "wb") as fout:
        with open(fasta_file, "rt") as fin:
            for record in SeqIO.parse(fin, 'fasta'):
                id = record.id
                if id not in chromRegionMap:
                    continue

                logger.info("Extracting dinucleotide of " + id + " ...")

                seq = str(record.seq)
                catItems = chromRegionMap[id]
                for ci in catItems:
                    catSeq = seq[ci.reference_start:ci.reference_end].upper()
                    if ci.strand == '-':
                        catSeq = str(Seq(catSeq).reverse_complement())
                    for si in range(0, len(catSeq) - 1):  # last dinucleotide starts at len - 2
                        dinu = catSeq[si:(si + 2)].upper()
                        fout.write(
                            "%s\t%d\t%d\t%s\t%d\t%s\n" %
                            (id, ci.reference_start + si,
                             ci.reference_start + si + 2, dinu, 1, ci.strand))

    output_file = output_prefix + ".bed.bgz"
    if os.path.exists(output_file):
        os.remove(output_file)
    os.rename(tmp_file, output_file)
    runCmd("tabix -p bed %s " % output_file, logger)

    count_file = output_prefix + ".count"
    dinucleotide_to_count(logger, output_file, count_file)

    logger.info("done.")
Example #16
def barcode_to_tag(input_file, output_file, barcode, verbose):
	samfile = pysam.AlignmentFile(input_file, "rb")
	header = str(samfile.header)

	barcode_length = len(regex.findall('B', barcode))
	umi_length = len(regex.findall('U', barcode))
	barcode_pattern = '('
	if barcode_length > 0:
		barcode_pattern += '(.?)BC:Z:[A-Z]{' + str(barcode_length) + '}'
	if umi_length > 0:
		barcode_pattern += '(.?)RX:Z:[A-Z]{' + str(umi_length) + '}'
	barcode_pattern += ')'

	total = 0
	wrote = 0
	dirname, basename = os.path.split(input_file)
	with tempfile.TemporaryFile(prefix=basename, dir=dirname) as tmp:
		with pysam.AlignmentFile(input_file, "rb") as infile:
			for read in infile:
				name_list = regex.split(barcode_pattern, read.query_name)
				if name_list[0] == '':
					name_list[2] = name_list[2].replace('.', '', 1)
				tags = regex.match(barcode_pattern, read.query_name).group().replace('.', '', 1)
				read.query_name = ''.join([name_list[0], name_list[2]])
				tags = tuple(tags.replace('.', ':Z:').split(':Z:'))
				read.tags = read.tags + [ tags[x:x + 2] for x in range(0, len(tags), 2) ]
				tmp.write((pysam.AlignedSegment.to_string(read)+'\n').encode('utf8'))
				total += 1
		tmp.seek(0)
		with bgzf.BgzfWriter(output_file, "wb") as outfile:
			outfile.write(header.encode('utf8'))
			for read in tmp:
				outfile.write(read)
				wrote += 1

	if verbose:
		print(total, "entries read from the input file.")
		print(wrote, "entries written to the output file.")


	return 0
Example #17
    def rewrite(self, compressed_input_file, output_file):
        with gzip.open(compressed_input_file, "rb") as h:
            data = h.read()

        with bgzf.BgzfWriter(output_file, "wb") as h:
            h.write(data)
            self.assertFalse(h.seekable())
            self.assertFalse(h.isatty())
            self.assertEqual(h.fileno(), h._handle.fileno())
        # Context manager should call close(),
        # Gives empty BGZF block as BAM EOF marker

        with gzip.open(output_file) as h:
            new_data = h.read()

        # Check the decompressed files agree
        self.assertTrue(new_data, "Empty BGZF file?")
        self.assertEqual(len(data), len(new_data))
        self.assertEqual(data, new_data)
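The empty block mentioned in the comment is the standard 28-byte BGZF EOF marker from the SAM/BAM specification; a small stdlib-only sketch to check for it:

# Sketch: verify a BGZF file ends with the 28-byte empty-block EOF marker.
BGZF_EOF = bytes.fromhex(
    "1f8b08040000000000ff0600424302001b0003000000000000000000")

def has_bgzf_eof(path):
    with open(path, "rb") as fh:
        fh.seek(-28, 2)  # 28 bytes back from the end of the file
        return fh.read(28) == BGZF_EOF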
Example #18
    def rewrite(self, compressed_input_file, output_file):
        h = gzip.open(compressed_input_file, "rb")
        data = h.read()
        h.close()

        h = bgzf.BgzfWriter(output_file, "wb")
        h.write(data)
        self.assertFalse(h.seekable())
        self.assertFalse(h.isatty())
        self.assertEqual(h.fileno(), h._handle.fileno())
        h.close()  # Gives empty BGZF block as BAM EOF marker

        h = gzip.open(output_file)
        new_data = h.read()
        h.close()

        # Check the decompressed files agree
        self.assertTrue(new_data, "Empty BGZF file?")
        self.assertEqual(len(data), len(new_data))
        self.assertEqual(data, new_data)
Example #19
    def __call__(self, string):
        # the special argument "-" means sys.std{in,out}
        if string == '-':
            if 'r' in self._mode:
                return sys.stdin
            elif 'w' in self._mode:
                return sys.stdout
            else:
                raise ValueError('argument "-" with mode %r' % self._mode)

        # all other arguments are used as file names
        try:
            if string[-3:] == ".gz":
                from Bio import bgzf
                if 'r' in self._mode:
                    return bgzf.BgzfReader(string, self._mode)
                elif 'w' in self._mode or 'a' in self._mode:
                    return bgzf.BgzfWriter(string, self._mode)
            else:
                return open(string, self._mode, self._bufsize)
        except OSError as e:
            raise ArgumentTypeError("can't open '%s': %s" % (string, e))
Example #20
def gzip_speed():

    gzip_pipe = subprocess.Popen(args="gzip -c > tmp_file.gz", shell=True, stdin=subprocess.PIPE)
    randos = []
    for i in range(20000):
        rands = numpy.random.rand(300)
        string = " ".join(map(lambda x: str(x), rands))
        randos.append(string)

    with timer.Timer(logger=print, name="OS GZip") as t:

        for r in randos:
            gzip_pipe.stdin.write(r.encode())
        gzip_pipe.stdin.close()
        gzip_pipe.wait()

    with timer.Timer(logger=print, name="GZipLib") as t, gzip.open("tmp2_file.gz", 'wt+', compresslevel=4) as f:
        for r in randos:
            f.write(r)

    with timer.Timer(logger=print, name="BGZipLib") as t, bgzf.BgzfWriter("tmp3_file.gz", 'wt+', compresslevel=4) as f:
        for r in randos:
            f.write(r)
Example #21
def phase_segment(out_filename, res, tumor_recs_used_reg, de_ads):
    with bgzf.BgzfWriter(out_filename) as fout:
        for info, ads in zip(tumor_recs_used_reg[res.phased],
                             de_ads[res.phased]):
            segment_first = True
            first_max = -1
            for rec, ad in zip(info, ads):
                line = f'{rec[0]}\t{rec[1]}\t{rec[2]}\t{rec[3]}\t{rec[4]}\t{rec[5]}\t{ad[0]}\t{ad[1]}'
                if first_max < 0:
                    if ad[0] > ad[1]:
                        first_max = 0
                    else:
                        first_max = 1
                if segment_first:
                    if first_max == 0:
                        if ad[0] > ad[1]:
                            line += '\t0/1'
                        else:
                            line += '\t1/0'
                    else:
                        if ad[0] > ad[1]:
                            line += '\t1/0'
                        else:
                            line += '\t0/1'
                    segment_first = False
                else:
                    if first_max == 0:
                        if ad[0] > ad[1]:
                            line += '\t0|1'
                        else:
                            line += '\t1|0'
                    else:
                        if ad[0] > ad[1]:
                            line += '\t1|0'
                        else:
                            line += '\t0|1'
                fout.write(line + '\n')
Example #22
def bam2dinucleotide(logger,
                     bamFile,
                     output_prefix,
                     genomeFastaFile,
                     mappingQuality=20,
                     uniqueOnly=False,
                     minCoverage=1,
                     isTest=False):
    check_file_exists(bamFile)
    check_file_exists(genomeFastaFile)

    logger.info("reading bam file %s ..." % bamFile)
    dinuItems = []
    count = 0
    with pysam.AlignmentFile(bamFile, "rb") as sf:
        for s in sf.fetch():
            count = count + 1
            if count % 1000000 == 0:
                logger.info(count)

                if isTest:
                    break

            if s.is_unmapped:
                continue

            if s.is_paired and (not s.is_read1):
                continue

            if s.mapping_quality < mappingQuality:
                continue

            if uniqueOnly:
                isUnique = True
                for tag in s.tags:
                    if tag[0] == 'XS':
                        isUnique = False
                        break
                if not isUnique:
                    continue

            if s.is_reverse:
                dinuItems.append(
                    DinucleotideItem(s.reference_name, s.reference_end,
                                     s.reference_end + 2, s.query_name,
                                     s.mapping_quality, "-", ""))
            else:
                dinuItems.append(
                    DinucleotideItem(s.reference_name, s.reference_start - 2,
                                     s.reference_start, s.query_name,
                                     s.mapping_quality, "+", ""))

    chrDinuMap = OrderedDict()
    for di in dinuItems:
        chrDinuMap.setdefault(di.reference_name, []).append(di)

    for chr in chrDinuMap.keys():
        values = chrDinuMap[chr]
        logger.info("sort %d dinucleotides in chromosome %s..." %
                    (len(values), chr))
        values.sort(key=get_reference_start)
        logger.info("combine %d dinucleotides in chromosome %s..." %
                    (len(values), chr))
        idx = len(values) - 1
        deleteList = set()
        while (idx > 0):
            curDinu = values[idx]
            prev = idx - 1
            while (prev >= 0):
                prevDinu = values[prev]
                if curDinu.reference_start != prevDinu.reference_start:
                    break
                if curDinu.strand == prevDinu.strand:
                    prevDinu.count = prevDinu.count + curDinu.count
                    deleteList.add(idx)
                    break
                prev = prev - 1
            idx = idx - 1
        chrDinuMap[chr] = [
            i for j, i in enumerate(values) if j not in deleteList
        ]
        logger.info(
            "after combine, there is %d dinucleotides in chromosome %s..." %
            (len(chrDinuMap[chr]), chr))

    if minCoverage > 1:
        for chr in chrDinuMap.keys():
            values = chrDinuMap[chr]
            chrDinuMap[chr] = [v for v in values if v.count >= minCoverage]

    with open(genomeFastaFile, "rt") as fin:
        for record in SeqIO.parse(fin, 'fasta'):
            id = record.id
            logger.info("Filling dinucleotide of " + id + " ...")

            if id in chrDinuMap.keys():
                seq = str(record.seq)
                seqlen = len(seq)
                chrDinuItems = chrDinuMap[id]
                for di in chrDinuItems:
                    if di.reference_start >= 0 and di.reference_end <= seqlen:
                        dinu = seq[di.reference_start:di.reference_end].upper()
                        if di.strand == "+":
                            dinu = str(Seq(dinu).reverse_complement())
                        di.dinucleotide = dinu

    tmp_file = output_prefix + ".tmp.bed.bgz"
    logger.info("Writing dinucleotide to " + tmp_file + " ...")
    with bgzf.BgzfWriter(tmp_file, "wb") as fout:
        for chrom in chrDinuMap.keys():
            diList = chrDinuMap[chrom]
            for s in diList:
                if (s.dinucleotide != "") and ('N' not in s.dinucleotide):
                    fout.write(
                        "%s\t%d\t%d\t%s\t%d\t%s\n" %
                        (s.reference_name, s.reference_start, s.reference_end,
                         s.dinucleotide, s.count, s.strand))

    output_file = output_prefix + ".bed.bgz"
    if os.path.exists(output_file):
        os.remove(output_file)
    os.rename(tmp_file, output_file)
    runCmd("tabix -p bed %s " % output_file, logger)

    count_file = output_prefix + ".count"
    dinucleotide_to_count(logger, output_file, count_file)

    logger.info("done.")
Example #23
try:
    allele_info = open(my_parser().allele, "r")
except FileNotFoundError:
    bomb(f"Missing argument or '{my_parser().allele}' may be empty\n")

toto = my_parser().type_

if not my_parser().out:
    bomb('Missing argument, "-o PREFIX", "--out PREFIX"\n'
         'Error: run `./snprecode -h` for complete arguments list '
         'required to recode from FImpute\n')

fo = my_parser().out
# open headers
if toto == 1:
    geno_out = bgzf.BgzfWriter(fo + ".vcf.gz", "wb")
    # write header
    geno_out.write('''##fileformat=VCFv4.2
##filedate=%s
##source="snprecode v1.0.4"
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
''' % (datetime.today().strftime('%Y%m%d')))
elif toto == 2:
    geno_out = open(fo + ".ped", "w")
else:
    print(
        '\nError!: Missing argument. Specify the recode type: `-t 1` (for VCF) or `-t 2` (for PED/MAP)'
    )
    print(
        'run `./snprecode -h` for complete arguments list required to recode from FImpute\n'
    )
Example #24
import gzip
import argparse

from Bio import SeqIO, bgzf
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

parser = argparse.ArgumentParser(prog='fastq_trim_umi')
parser.add_argument('-i', help='input fastq.gz file', dest='input_fastq_gz')
parser.add_argument('-o', help='output fastq.gz file', dest='output_fastq_gz')
parser.add_argument('-l',
                    help='length of the UMI barcode',
                    dest='umi_len',
                    default=12,
                    type=int)
args = parser.parse_args()

ifilename = args.input_fastq_gz
ofilename = args.output_fastq_gz
umilen = args.umi_len

with gzip.open(ifilename, "r") as handle, bgzf.BgzfWriter(ofilename,
                                                          "wb") as fout:
    for rec in SeqIO.parse(handle, "fastq"):
        umi = rec.seq[0:umilen]

        rid = rec.description + ":" + str(umi)
        rseq = Seq(str(rec.seq)[umilen:])  # Biopython >=1.78: Seq takes no alphabet argument
        rq = rec.letter_annotations["phred_quality"][umilen:]

        nrec = SeqRecord(rseq, id=rid, description="")
        nrec.letter_annotations["phred_quality"] = rq

        SeqIO.write(sequences=nrec, handle=fout, format="fastq")
Example #25
args = parser.parse_args()
if DEBUG:
    args.input = "T:/Shared/Labs/Linton Lab/20180913_linton_exomeseq_2118_human_cutadapt/bwa_refine_hc_gvcf_hardfilter/result/linton_exomeseq_2118.pass.vcf"
    args.output = "T:/Shared/Labs/Linton Lab/20180913_linton_exomeseq_2118_human_cutadapt/bwa_refine_hc_gvcf_hardfilter_vep/result/linton_exomeseq_2118.pass.filtered.vcf"

percentage = float(args.percentage)
frequency = float(args.frequency)

logger = initialize_logger(args.output + ".log", 'filterVcf', args.debug)
logger.info(str(args))

basename = os.path.splitext(args.output)[0]

if args.output.endswith(".gz"):
    outputTemp = basename + ".tmp.gz"
    fout = bgzf.BgzfWriter(outputTemp, "wb")
    fdiscard = bgzf.BgzfWriter(basename + ".discard.gz", "wb")
else:
    outputTemp = basename + ".tmp"
    fout = open(outputTemp, "wt")
    fdiscard = open(basename + ".discard", "wt")

if args.input.endswith(".gz"):
    if is_version_2():
        fin = gzip.open(args.input, 'rb')
    else:
        fin = gzip.open(args.input, 'rt')
else:
    fin = open(args.input, "r")

with fout:
Example #26
    def __call__(self):
        FastQFlag = self.flag
        datas = self.datas
        read_lists = []
        MinTime = time.time()
        FailedReads = 0
        ChrList = []
        try:
            file_path_check = next(i for i in datas if 'pass' in i)
        except StopIteration:
            try:
                file_path_check = next(i for i in datas if 'fail' in i)
            except StopIteration:
                file_path_check = datas[0]
        p_check = Pathcheck(file_path_check)
        for ds in datas:
            try:
                res = get_content(ds, FastQFlag, p_check)
                ch = res[0][0]
                mu = res[0][1]
                ChrList.append(res[0][1:])
                NewTime = float(res[0][2])
                if NewTime < MinTime:
                    MinTime = NewTime
                if FastQFlag:
                    if float(res[0][4]) >= 7.0:
                        read_lists.append(res[1])
            except Exception:
                continue  # skip entries that cannot be parsed
        ChrList.sort(key=lambda row: row[1])  # order rows by read start time
        TimeVec = [
            int(
                time.strftime(
                    '%H',
                    time.localtime(float(row[1]) - float(self.RefStart))))
            for row in ChrList
        ]
        for e in range(len(TimeVec)):
            hour = TimeVec[e]
            if e > 1:
                if hour < TimeVec[e - 1]:
                    TimeVec[e] = hour + 24
        cols = list(zip(*ChrList))
        ChrList = list(zip(TimeVec, cols[2], cols[3], cols[0], cols[4]))
        ReadPerChannel = len(ChrList)
        BasesPerChannel = sum(int(row[1]) for row in ChrList)
        MuxProductivity = {str(il): [] for il in range(1, 5)}
        for k in MuxProductivity.keys():
            MuxProd = {}
            MuxBase = {}
            MucList = [row for row in ChrList if row[3] == k]
            for hr, val, qual, muc, gcs in MucList:
                if float(qual) < 7.0:
                    FailedReads += 1
                if str(hr) in MuxProd:
                    MuxProd[str(hr)] += int(val)
                    MuxBase[str(hr)] += 1

                else:
                    MuxProd[str(hr)] = int(val)
                    MuxBase[str(hr)] = 1
            MuxProductivity[k].append(
                dict(
                    zip(MuxBase.keys(), zip(MuxBase.values(),
                                            MuxProd.values()))))
        if FastQFlag:
            file_out = os.path.join(self.t_dir,
                                    'tmp.' + str(self.Norder) + '.fastq.gz')
            Gzout = yielder(read_lists)
            with bgzf.BgzfWriter(file_out, "wb") as outgz:
                SeqIO.write(sequences=Gzout, handle=outgz, format="fastq")
        ObjectOut = []
        ObjectOut.append(str(self.Norder))
        ObjectOut.append(str(ReadPerChannel))
        ObjectOut.append(str(BasesPerChannel))
        ObjectOut.append(str(FailedReads))
        ObjectOut.append(ChrList)
        ObjectOut.append(MuxProductivity)
        return ObjectOut
Example #27
import sys
import re
import os
import gzip

from Bio import SeqIO, bgzf

rev_com = []
output = "rev_com_" + sys.argv[1]
with gzip.open(sys.argv[1], "rt") as handle:
    for my_seq in SeqIO.parse(handle, "fastq"):
        rev_com.append(
            my_seq.reverse_complement(id=my_seq.id,
                                      description=my_seq.description))
with bgzf.BgzfWriter(output, "wb") as outgz:
    SeqIO.write(rev_com, outgz, "fastq")
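Run as, for example (the script name is an assumption):

#   python rev_com.py reads.fastq.gz   ->   rev_com_reads.fastq.gz
# Note sys.argv[1] is prefixed verbatim, so an input under a directory such
# as data/reads.fastq.gz would yield the path rev_com_data/reads.fastq.gz.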
Example #28
    def __init__(self,
                 out_prefix,
                 paired=False,
                 bam_header=None,
                 vcf_header=None,
                 no_fastq=False,
                 fasta_instead=False):

        self.fasta_instead = fasta_instead
        # TODO Eliminate paired end as an option for fastas. Plan is to create a write fasta method.
        if self.fasta_instead:
            fq1 = pathlib.Path(out_prefix + '.fasta.gz')
            fq2 = None
        else:
            fq1 = pathlib.Path(out_prefix + '_read1.fq.gz')
            fq2 = pathlib.Path(out_prefix + '_read2.fq.gz')
        bam = pathlib.Path(out_prefix + '_golden.bam')
        vcf = pathlib.Path(out_prefix + '_golden.vcf.gz')

        # TODO Make a fasta-specific method
        self.no_fastq = no_fastq
        if not self.no_fastq:
            self.fq1_file = bgzf.open(fq1, 'w')

            self.fq2_file = None
            if paired:
                self.fq2_file = bgzf.open(fq2, 'w')

        # VCF OUTPUT
        self.vcf_file = None
        if vcf_header is not None:
            self.vcf_file = bgzf.open(vcf, 'wb')

            # WRITE VCF HEADER
            self.vcf_file.write('##fileformat=VCFv4.1\n'.encode('utf-8'))
            reference = '##reference=' + vcf_header[0] + '\n'
            self.vcf_file.write(reference.encode('utf-8'))
            self.vcf_file.write(
                '##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">\n'
                .encode('utf-8'))
            self.vcf_file.write(
                '##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">\n'
                .encode('utf-8'))
            self.vcf_file.write(
                '##INFO=<ID=VMX,Number=1,Type=String,Description="SNP is Missense in these Read Frames">\n'
                .encode('utf-8'))
            self.vcf_file.write(
                '##INFO=<ID=VNX,Number=1,Type=String,Description="SNP is Nonsense in these Read Frames">\n'
                .encode('utf-8'))
            self.vcf_file.write(
                '##INFO=<ID=VFX,Number=1,Type=String,Description="Indel Causes Frameshift">\n'
                .encode('utf-8'))
            self.vcf_file.write(
                '##INFO=<ID=WP,Number=A,Type=Integer,Description="NEAT-GenReads ploidy indicator">\n'
                .encode('utf-8'))
            self.vcf_file.write(
                '##ALT=<ID=DEL,Description="Deletion">\n'.encode('utf-8'))
            self.vcf_file.write(
                '##ALT=<ID=DUP,Description="Duplication">\n'.encode('utf-8'))
            self.vcf_file.write(
                '##ALT=<ID=INS,Description="Insertion of novel sequence">\n'.
                encode('utf-8'))
            self.vcf_file.write(
                '##ALT=<ID=INV,Description="Inversion">\n'.encode('utf-8'))
            self.vcf_file.write(
                '##ALT=<ID=CNV,Description="Copy number variable region">\n'.
                encode('utf-8'))
            self.vcf_file.write(
                '##ALT=<ID=TRANS,Description="Translocation">\n'.encode(
                    'utf-8'))
            self.vcf_file.write(
                '##ALT=<ID=INV-TRANS,Description="Inverted translocation">\n'.
                encode('utf-8'))
            # TODO add sample to vcf output
            self.vcf_file.write(
                '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n'.encode(
                    'utf-8'))

        # BAM OUTPUT
        self.bam_file = None
        if bam_header is not None:
            self.bam_file = bgzf.BgzfWriter(
                bam, 'w', compresslevel=BAM_COMPRESSION_LEVEL)

            # WRITE BAM HEADER
            self.bam_file.write("BAM\1")
            header = '@HD\tVN:1.5\tSO:coordinate\n'
            for n in bam_header[0]:
                header += '@SQ\tSN:' + n[0] + '\tLN:' + str(n[3]) + '\n'
            header += '@RG\tID:NEAT\tSM:NEAT\tLB:NEAT\tPL:NEAT\n'
            header_bytes = len(header)
            num_refs = len(bam_header[0])
            self.bam_file.write(pack('<i', header_bytes))
            self.bam_file.write(header)
            self.bam_file.write(pack('<i', num_refs))

            for n in bam_header[0]:
                l_name = len(n[0]) + 1
                self.bam_file.write(pack('<i', l_name))
                self.bam_file.write(n[0] + '\0')
                self.bam_file.write(pack('<i', n[3]))

        # buffers for more efficient writing
        self.fq1_buffer = []
        self.fq2_buffer = []
        self.bam_buffer = []
Example #29
from Bio import SeqIO, bgzf
from gzip import open as gzopen
import random

fq1 = SeqIO.parse(gzopen(
    "/home/wanghm/whm/ATAC/S0821_05A_CHG036758-Lane41-PH1-7d-ACAGTGGT_L001_R1.fastq.gz",
    "rt"),
                  format="fastq")
fq2 = SeqIO.parse(gzopen(
    "/home/wanghm/whm/ATAC/S0821_05A_CHG036758-Lane41-PH1-7d-ACAGTGGT_L001_R2.fastq.gz",
    "rt"),
                  format="fastq")

handle_out_rep1_r1 = bgzf.BgzfWriter(
    "/home/wanghm/whm/ATAC/split/sex_rep1_r1.fastq.gz", "ab")
handle_out_rep1_r2 = bgzf.BgzfWriter(
    "/home/wanghm/whm/ATAC/split/sex_rep1_r2.fastq.gz", "ab")
rep1_count = 0
handle_out_rep2_r1 = bgzf.BgzfWriter(
    "/home/wanghm/whm/ATAC/split/sex_rep2_r1.fastq.gz", "ab")
handle_out_rep2_r2 = bgzf.BgzfWriter(
    "/home/wanghm/whm/ATAC/split/sex_rep2_r2.fastq.gz", "ab")
rep2_count = 0

ll = [[handle_out_rep1_r1, handle_out_rep1_r2, rep1_count],
      [handle_out_rep2_r1, handle_out_rep2_r2, rep2_count]]

reads_count = 42304520  # reads count in fastq file
for seq in zip(fq1, fq2):

    tmp_repo = random.choice(ll)  # choose random element in list
Example #30

for reads_file in options.reads:

    #  output filename
    output = reads_file.split('.')[0] + '.collapsed.fasta.gz'

    #  dictionary to count multiplicities of unique read sequences as key values, preserving the order of appearance
    unique_seqs = OrderedDict()

    with gzip.open(reads_file, 'rt') as fd:
        for r in SeqIO.parse(fd, 'fastq'):
            seq = str(r.seq)
            # count multiplicities: get() returns 0 the first time a sequence is seen
            unique_seqs[seq] = unique_seqs.get(seq, 0) + 1

    #  save FASTA records one at a time, let the OS deal with the IO...
    with bgzf.BgzfWriter(output, 'wb') as fd:
        for n, (seq, count) in enumerate(unique_seqs.items()):
            SeqIO.write(
                SeqRecord(Seq(seq),  # Biopython >=1.78: Seq takes no alphabet argument
                          id='read_' + str(n + 1) + '_x' + str(count),
                          description=''), fd, 'fasta')
