Python Fastqの例、pyfastx.Fastq Pythonの例

コード例 #1

0

ファイルを表示

	def setUp(self):
		"""Open the test FASTQ files and parse the flat file by hand for reference.

		After this runs, ``self.reads`` maps a 0-based read index to
		``[name, sequence, quality]`` and ``self.bases`` holds the number of
		A/T/G/C/N characters counted over all sequence lines.
		"""
		# The gzipped file is opened twice on purpose; presumably the first
		# call creates the index and the second one reloads it -- confirm.
		self.fastq = pyfastx.Fastq(gzip_fastq)
		self.fastq = pyfastx.Fastq(gzip_fastq)

		# Uncompressed copy of the data
		self.flatq = pyfastx.Fastq(flat_fastq)

		self.reads = {}
		self.bases = dict.fromkeys('ATGCN', 0)

		with open(flat_fastq) as handle:
			for line_no, text in enumerate(handle, 1):
				# Every record spans four lines: header, sequence, '+', quality
				read_idx, field = divmod(line_no - 1, 4)

				if field == 0:
					# Keep only the first word of the header, without its first character
					self.reads[read_idx] = [text[1:].strip().split()[0], 0, 0]

				elif field == 1:
					self.reads[read_idx][1] = text.strip()
					for base in self.bases:
						self.bases[base] += text.count(base)

				elif field == 3:
					self.reads[read_idx][2] = text.strip()

コード例 #2

0

ファイルを表示

ファイル: test_fastq.py プロジェクト: wx904/pyfastx

    def test_build(self):
        """Delete any existing index file, rebuild it explicitly and reopen the file."""
        del self.fastq

        # Remove an index left over from an earlier run
        index_path = '{}.fxi'.format(gzip_fastq)
        if os.path.exists(index_path):
            os.remove(index_path)

        # Open without indexing, then trigger the build by hand
        unindexed = pyfastx.Fastq(gzip_fastq, build_index=False)
        unindexed.build_index()

        self.fastq = pyfastx.Fastq(gzip_fastq)

コード例 #3

0

ファイルを表示

def load_seqfile(infile):
    """Open *infile* with pyfastx, picking the reader from the file suffix.

    ``build_index`` is passed as True only when no ``<infile>.fxi`` file
    exists on disk yet; otherwise it is passed as False.

    Args:
        infile: path of the sequence file, as a string.

    Returns:
        A ``pyfastx.Fasta`` or ``pyfastx.Fastq`` object.

    Raises:
        ValueError: if *infile* ends with neither FASTA_SUFFIX nor
            FASTQ_SUFFIX.
    """
    if infile.endswith(FASTA_SUFFIX):
        reader = pyfastx.Fasta
    elif infile.endswith(FASTQ_SUFFIX):
        reader = pyfastx.Fastq
    else:
        # Without this branch the function reached ``return seqfile`` with
        # the name never assigned and failed with an UnboundLocalError.
        raise ValueError(
            "Could not determine input file format: {}".format(infile)
        )

    index_missing = not os.path.exists(infile + ".fxi")
    return reader(infile, build_index=index_missing)

コード例 #4

0

ファイルを表示

def fastx_info(args):
	"""Print summary statistics for the FASTA or FASTQ file in ``args.fastx``.

	The file type comes from ``fastx_format_check``; nothing is printed when
	it reports anything other than 'fasta' or 'fastq'.
	"""
	kind = fastx_format_check(args.fastx)

	if kind == 'fasta':
		fasta = pyfastx.Fasta(args.fastx)
		base_counts = fasta.composition
		print("Sequence counts: {}".format(len(fasta)))
		print("Total bases: {}".format(fasta.size))
		print("GC content: {:.2f}%".format(fasta.gc_content))
		for base in base_counts:
			print("{} counts: {}".format(base, base_counts[base]))
		print("Mean length: {:.2f}".format(fasta.mean))
		print("Median length: {:.2f}".format(fasta.median))
		print("Max length: {}".format(len(fasta.longest)))
		print("Min length: {}".format(len(fasta.shortest)))
		print("N50, L50: {}, {}".format(*fasta.nl()))
		print("length >= 1000: {}".format(fasta.count(1000)))
		return

	if kind != 'fastq':
		return

	fastq = pyfastx.Fastq(args.fastx)
	base_counts = fastq.composition
	print("Read counts: {}".format(len(fastq)))
	print("Total bases: {}".format(fastq.size))
	print("GC content: {:.2f}%".format(fastq.gc_content))
	for base in base_counts:
		print("{} counts: {}".format(base, base_counts[base]))
	print("Quality encoding system maybe: {}".format(", ".join(fastq.encoding_type)))

コード例 #5

0

ファイルを表示

ファイル: utils.py プロジェクト: np-core/nanopath

def get_output_handle(fpath: str, fastx: bool = False, out: bool = True):
    """Return a handle for *fpath*.

    ``"-"`` selects a standard stream: ``sys.stdout`` when *out* is true,
    ``sys.stdin`` otherwise. Any other path must live in an existing
    directory. With *fastx* set the path is opened with pyfastx (``Fasta``
    when the path ends in "a", ``Fastq`` otherwise); without it the file is
    opened for text writing.

    Raises:
        NotADirectoryError: if the parent directory of *fpath* does not exist.
    """
    if fpath == "-":
        return sys.stdout if out else sys.stdin

    target = Path(fpath)
    parent_dir = target.parent
    if not parent_dir.is_dir():
        raise NotADirectoryError(
            "Directory specified for output file does not exist: {}".format(
                parent_dir
            )
        )

    if not fastx:
        return target.open("w")

    # Only the last character of the path decides between the two readers
    reader = pyfastx.Fasta if fpath.endswith("a") else pyfastx.Fastq
    return reader(target)

コード例 #6

0

ファイルを表示

def run(args, name):
    """Extract run ID and barcode from the first read and store them as JSON.

    The header of the first read in ``args.inputFile`` is split on ':'.
    The run ID is built from the first three fields and the barcode is the
    last field. Both are added to a ``storage.JSON`` object under
    ``args.name``, which is then printed, written to ``args.outputDir``
    and logged.

    Args:
        args: parsed arguments; reads ``inputFile``, ``JSON``, ``sample``,
            ``name`` and ``outputDir``.
        name: not used inside this function (results are stored under
            ``args.name``); kept so the signature stays unchanged.

    Raises:
        ValueError: if the input file contains no reads.
    """
    # Only the first read is needed, so stop right after it
    for read in pyfastx.Fastq(args.inputFile):
        fields = read.description.split(':')
        runid = f"{fields[0].replace('@', '')}_0{fields[1]}_A{fields[2]}"
        barcode = fields[-1]
        break
    else:
        # An empty file used to surface later as an unbound-variable error
        raise ValueError(
            "No reads found in input file: {}".format(args.inputFile)
        )

    # list of tuples of input for JSON
    results = [("runID", runid),
               ("barcode", barcode)]

    # When no JSON path was given, name the new object after the sample;
    # JSON.open is called with args.JSON in either case.
    JSON = storage.JSON()
    if not args.JSON:
        JSON.name(args.sample)
    JSON.open(args.JSON)
    JSON.add_results(args.name, results)
    JSON.pretty_print()
    JSON.write(args.outputDir)

    logging.info(results)

コード例 #7

0

ファイルを表示

def fastq_sample(args):
	"""Write a random subset of the reads in ``args.fastx`` as FASTQ.

	The subset size comes from ``args.num`` (capped at the number of reads)
	or, when that is not a positive number, from the fraction ``args.prop``
	(0 < prop <= 1). Output goes to ``args.outfile``, or to stdout when
	``args.outfile`` is None.

	Raises:
		RuntimeError: if neither option yields a usable size, or if the
			proportion rounds down to zero reads.
	"""
	fq = pyfastx.Fastq(args.fastx)
	total = len(fq)

	if args.num is not None and args.num > 0:
		seq_num = min(args.num, total)

	# This branch must test args.prop: checking args.num here made the
	# proportion option unreachable and compared None with a number.
	elif args.prop is not None and 0 < args.prop <= 1:
		seq_num = round(total*args.prop)
		if seq_num == 0:
			raise RuntimeError("the proportion is too small")

	else:
		raise RuntimeError("specify a right number for seq number or proportion")

	selected = random.sample(range(total), k=seq_num)

	if args.outfile is None:
		fw = sys.stdout
	else:
		fw = open(args.outfile, 'w')

	try:
		for idx in selected:
			r = fq[idx]
			fw.write("@{}\n{}\n+\n{}\n".format(r.name, r.seq, r.qual))
	finally:
		# Release the output even if a read lookup or write fails;
		# stdout is only flushed, never closed.
		if args.outfile is None:
			fw.flush()
		else:
			fw.close()

コード例 #8

0

ファイルを表示

	def test_iter_tuple(self):
		"""Index-free iteration must yield the same name/sequence/quality as self.reads."""
		records = pyfastx.Fastq(flat_fastq, build_index=False)
		for position, (name, seq, qual) in enumerate(records):
			expected = self.reads[position]
			self.assertEqual(name, expected[0])
			self.assertEqual(seq, expected[1])
			self.assertEqual(qual, expected[2])

コード例 #9

0

ファイルを表示

ファイル: utils.py プロジェクト: Wytamma/sketchy

def create_fastx_index(fastx):
    """Open *fastx* with pyfastx (building its index) and pair it with a read builder.

    Returns:
        ``(pyfastx.Fasta, build_read_fasta)`` when ``is_fasta`` accepts the
        file, ``(pyfastx.Fastq, build_read_fastq)`` when ``is_fastq`` does.

    Raises:
        ValueError: if neither check recognises the file.
    """
    if is_fasta(fastx):
        indexed = pyfastx.Fasta(str(fastx), build_index=True)
        return indexed, build_read_fasta

    if is_fastq(fastx):
        indexed = pyfastx.Fastq(str(fastx), build_index=True)
        return indexed, build_read_fastq

    raise ValueError(f'Could not determine input file format: {fastx}')

コード例 #10

0

ファイルを表示

ファイル: index.py プロジェクト: sjin09/fastx

def build_index(infile):
    """Create the ``.fxi`` index for *infile* unless one is already on disk.

    Files ending in .fa, .fa.gz, .fasta or .fasta.gz are opened with
    ``pyfastx.Fasta``; every other name is opened with ``pyfastx.Fastq``.
    Progress messages are printed to stdout.
    """
    if os.path.exists(infile + ".fxi"):
        print("fxi index is present")
        return

    print("buliding fxi index for {}".format(infile))
    fasta_suffixes = (".fa", ".fa.gz", ".fasta", ".fasta.gz")
    opener = pyfastx.Fasta if infile.endswith(fasta_suffixes) else pyfastx.Fastq
    # The returned object is discarded; the call is made for its side effect
    opener(infile)
    print("fxi index has been created for {}".format(infile))

コード例 #11

0

ファイルを表示

ファイル: test_fastq.py プロジェクト: wx904/pyfastx

    def test_exception(self):
        """Check the exception types raised for a bad path, index and keys."""
        # Path that does not exist
        with self.assertRaises(FileExistsError):
            pyfastx.Fastq('a_fastq_file_not_exists')

        # One position past the last read
        with self.assertRaises(IndexError):
            self.fastq[len(self.fastq)]

        # A key of an unsupported type, then a name that is not in the file
        for bad_key in (int, 'abc'):
            with self.assertRaises(KeyError):
                self.fastq[bad_key]

コード例 #12

0

ファイルを表示

ファイル: utils.py プロジェクト: np-core/nanopath

def create_fastx_index(fastx: Path) -> (pyfastx.Fasta, Path):
    """Open *fastx* with pyfastx, building its index, and return the index path too.

    Returns:
        A pair of the opened ``pyfastx.Fasta`` / ``pyfastx.Fastq`` object
        and the path of *fastx* with ``.fxi`` appended.

    Raises:
        ValueError: if neither ``is_fasta`` nor ``is_fastq`` accepts the file.
    """
    if is_fasta(fastx):
        reader = pyfastx.Fasta
    elif is_fastq(fastx):
        reader = pyfastx.Fastq
    else:
        raise ValueError(
            f'Could not determine input file format: {fastx}'
        )

    source = str(fastx)
    opened = reader(source, build_index=True)
    return opened, Path(source + '.fxi')

コード例 #13

0

ファイルを表示

def fastx_fq2fa(args):
	"""Convert the FASTQ file ``args.fastx`` to FASTA.

	Each read is written as a ``>name`` line followed by its sequence.
	Output goes to ``args.outfile`` when it is set, otherwise to stdout.
	"""
	fq = pyfastx.Fastq(args.fastx)

	if args.outfile:
		fh = open(args.outfile, 'w')
	else:
		fh = sys.stdout

	try:
		for read in fq:
			fh.write(">{}\n{}\n".format(read.name, read.seq))
	finally:
		# Close our own file even when reading or writing fails;
		# stdout is only flushed, never closed.
		if args.outfile:
			fh.close()
		else:
			fh.flush()

コード例 #14

0

ファイルを表示

ファイル: utils.py プロジェクト: np-core/nanopath

    def prepare_fastq(self) -> dict:
        """Open the read file of every organism listed in ``self.composition``.

        Returns:
            A dict mapping each organism key to a ``pyfastx.Fastq`` object
            opened on that entry's ``'file'`` value.

        Raises:
            ValueError: if one of the files does not exist.
        """
        opened = {}
        for organism, entry in self.composition.items():
            source = entry['file']
            file_path = Path(source)

            # Fail before handing a missing path to pyfastx
            if not file_path.exists():
                raise ValueError(f'File {file_path} does not exist.')

            opened[organism] = pyfastx.Fastq(source)

        self.logger.info('Prepared read files - proceeding')

        return opened

コード例 #15

0

ファイルを表示

def fastq_split(args):
	"""Split the FASTQ file ``args.fastx`` into several smaller files.

	Either ``args.file_num`` (number of output files) or ``args.seq_count``
	(reads per output file) sets the chunk size. Output files are named
	``<name>.<zero-padded part number><original suffixes>`` and are placed in
	``args.outdir`` when it is given. Parts are gzip-compressed when the
	input is (``fq.is_gzip``).
	"""
	fq = pyfastx.Fastq(args.fastx)

	if args.file_num:
		seqs_num = math.ceil(len(fq)/args.file_num)
		parts_num = args.file_num
	else:
		seqs_num = args.seq_count
		parts_num = math.ceil(len(fq)/seqs_num)

	name, suffix1 = os.path.splitext(os.path.basename(args.fastx))

	# For gzipped input the first split removed the compression suffix;
	# split once more to separate the format suffix from the base name
	if fq.is_gzip:
		name, suffix2 = os.path.splitext(name)

	# Pad part numbers to the same width
	digit = len(str(parts_num))

	seq_write = 0
	fh = None
	file_num = 0

	try:
		for read in fq:
			# Start a new part whenever the previous one is full
			if seq_write == 0:
				file_num += 1

				if fq.is_gzip:
					subfile = "{}.{}{}{}".format(name, str(file_num).zfill(digit), suffix2, suffix1)
				else:
					subfile = "{}.{}{}".format(name, str(file_num).zfill(digit), suffix1)

				if args.outdir is not None:
					subfile = os.path.join(args.outdir, subfile)

				if fq.is_gzip:
					fh = gzip.open(subfile, 'wt')
				else:
					fh = open(subfile, 'w')

			fh.write("@{}\n{}\n+\n{}\n".format(read.name, read.seq, read.qual))
			seq_write += 1

			if seq_write == seqs_num:
				fh.close()
				seq_write = 0
	finally:
		# fh stays None when the input has no reads; the unconditional
		# close used to fail on that. Closing here also releases a
		# partly written part if an error interrupts the loop.
		if fh is not None:
			fh.close()

コード例 #16

0

ファイルを表示

    def setUp(self):
        """Open the gzipped FASTQ and collect the expected read names from the flat file."""
        self.fastq = pyfastx.Fastq(gzip_fastq)

        names = []
        with open(flat_fastq) as handle:
            for text in handle:
                # Only lines beginning with '@' are treated as headers
                if text[0] != '@':
                    continue
                # First word of the line, minus the leading '@'
                names.append(text.split()[0][1:])

        self.keys = names
        self.count = len(self.keys)

コード例 #17

0

ファイルを表示

import sys
import pyfastx

# Print the name of every read in the FASTQ file given as first argument
fastq_path = sys.argv[1]
for record in pyfastx.Fastq(fastq_path):
    print(record.name)

コード例 #18

0

ファイルを表示

ファイル: pyfastx_fastq_build_index.py プロジェクト: wx904/pyfastx

import sys
import pyfastx

# Open the FASTQ file named by the first command-line argument. The returned
# object is discarded, so the call is made only for its side effects --
# presumably creating the index file, going by the script's name; confirm.
pyfastx.Fastq(sys.argv[1])

コード例 #19

0

ファイルを表示

import sys
import random
import pyfastx

# Usage: <script> SEED FASTQ [FASTQ ...]
# For every FASTQ file, write the names of up to 10000 randomly chosen reads
# to "<file>.list", then print the file name.
random.seed(sys.argv[1])

for fqfile in sys.argv[2:]:
    fq = pyfastx.Fastq(fqfile)
    read_total = len(fq)

    # random.sample raises ValueError when asked for more items than the
    # population holds, so cap the request for files under 10000 reads.
    sample_size = min(10000, read_total)
    samples = set(random.sample(range(read_total), sample_size))

    with open("{}.list".format(fqfile), 'w') as fw:
        for r in fq:
            # samples holds 0-based positions; presumably r.id is 1-based -- confirm
            if (r.id - 1) in samples:
                fw.write("{}\n".format(r.name))

    print(fqfile)

コード例 #20

0

ファイルを表示

ファイル: cutsite.py プロジェクト: koszullab/hicstuff

def cut_ligation_sites(
    fq_for, fq_rev, digest_for, digest_rev, enzyme, mode, seed_size, n_cpu
):
    """Create new reads to manage pairs with a digestion and create multiple
    pairs to take into account all the contact present.

    The function writes two files, one for the forward and one for the reverse
    fastq, with the new reads. The new reads have ":[0-9]" added at the end of
    the ID to differentiate the different pairs created from one read.

    The function will look for all the sites present and create new pairs of
    reads according to the mode given to retrieve as much as possible of the
    HiC signal.

    If the two input files are not in the same read order, an error is logged,
    the writer process is shut down and the program exits with status 1.

    Parameters
    ----------
    fq_for : str
        Path to the forward fastq file to digest.
    fq_rev : str
        Path to the reverse fastq file to digest.
    digest_for : str
        Path to the output digested forward fastq file to write.
    digest_rev : str
        Path to the output digested reverse fastq file to write.
    enzyme : str
        The list of restriction enzyme used to digest the genome separated by a
        comma. Example: HpaII,MluCI.
    mode : str
        Mode to use to make the digestion. Three values possible: "all",
        "for_vs_rev", "pile".
    seed_size : int
        Minimum size of a fragment (i.e. seed size used in mapping as reads
        smaller won't be mapped.)
    n_cpu : int
        Number of CPUs.
    """
    # Process the ligation sites given
    ligation_sites = hcd.gen_enzyme_religation_regex(enzyme)

    # Defined stop_token which is used to mark the end of input file
    stop_token = "STOP"
    # A stack is a string containing multiple read pairs
    max_stack_size = 1000

    # Create count to have an idea of the digested pairs repartition.
    original_number_of_pairs = 0
    final_number_of_pairs = 0
    new_reads_for = ""
    new_reads_rev = ""
    current_stack = 0

    # Start a separate writer process that consumes stacks from the queue
    # ctx = multiprocessing.get_context("spawn")
    queue = multiprocessing.Queue(max(1, n_cpu - 1))
    writer_process = multiprocessing.Process(
        target=_writer, args=(digest_for, digest_rev, queue, stop_token)
    )
    writer_process.start()

    # Iterate on all pairs
    for read_for, read_rev in zip(
        pyfastx.Fastq(fq_for, build_index=False),
        pyfastx.Fastq(fq_rev, build_index=False),
    ):

        # Count the numbers of original reads processed.
        original_number_of_pairs += 1

        # Count for stack size.
        current_stack += 1

        # Extract components of the reads.
        for_name, for_seq, for_qual = read_for
        rev_name, rev_seq, rev_qual = read_rev

        # Sanity check to be sure all reads are with their mate.
        if for_name != rev_name:
            # Report the unpacked names: the items iterated here are
            # (name, seq, qual) triples, so reading an ``.id`` attribute on
            # them failed before the message could be logged.
            logger.error(
                "The fastq files contains reads not sorted :\n{0}\n{1}".format(
                    for_name, rev_name
                )
            )
            # Tell the writer to finish before exiting; otherwise it keeps
            # waiting on the queue and blocks interpreter shutdown.
            queue.put(stop_token)
            writer_process.join()
            sys.exit(1)

        # Cut the forward and reverse reads at the ligation sites.
        for_seq_list, for_qual_list = cutsite_read(
            ligation_sites, for_seq, for_qual, seed_size,
        )
        rev_seq_list, rev_qual_list = cutsite_read(
            ligation_sites, rev_seq, rev_qual, seed_size,
        )

        # Write the new combinations of fragments.
        new_reads_for, new_reads_rev, final_number_of_pairs = write_pair(
            new_reads_for,
            new_reads_rev,
            for_name,
            for_seq_list,
            for_qual_list,
            rev_seq_list,
            rev_qual_list,
            mode,
            final_number_of_pairs,
        )

        # If stack full, add it in the queue.
        if current_stack == max_stack_size:

            # Add the pair in the queue.
            pairs = (new_reads_for.encode(), new_reads_rev.encode())
            queue.put(pairs)

            # Empty the stack
            current_stack = 0
            new_reads_for = ""
            new_reads_rev = ""

    # Flush the last, partly filled stack and end the parallel processing.
    pairs = (new_reads_for.encode(), new_reads_rev.encode())
    queue.put(pairs)
    queue.put(stop_token)
    writer_process.join()

    # Return information on the different pairs created
    logger.info(f"Library used: {fq_for} - {fq_rev}")
    logger.info(
        f"Number of pairs before digestion: {original_number_of_pairs}"
    )
    logger.info(
        f"Number of pairs after digestion: {final_number_of_pairs}"
    )

コード例 #21

0

ファイルを表示

ファイル: fqcnt_py6x_pyfx.py プロジェクト: thomasvangurp/biofast

#!/usr/bin/env python

if __name__ == "__main__":
	import sys, re, gzip, pyfastx

	# Without an input file, show the usage line and stop
	if len(sys.argv) == 1:
		print("Usage: fqcnt.py <in.fq.gz>")
		sys.exit(0)

	# Count records and add up sequence and quality string lengths
	record_total = 0
	seq_total = 0
	qual_total = 0
	for _, seq, qual in pyfastx.Fastq(sys.argv[1], build_index=False):
		record_total += 1
		seq_total += len(seq)
		# An empty or missing quality string contributes nothing
		if qual:
			qual_total += len(qual)

	print('{}\t{}\t{}'.format(record_total, seq_total, qual_total))

コード例 #22

0

ファイルを表示

ファイル: test_fastq.py プロジェクト: wx904/pyfastx

    def test_full_name(self):
        """Iterate the flat FASTQ with full_name=True and look each read up in self.fastq."""
        fq = pyfastx.Fastq(flat_fastq, build_index=False, full_name=True)

        for name, _, _ in fq:
            # The first word of the iterated name is used as the lookup key.
            # NOTE(review): assertTrue(expr, msg) only checks that `name` is
            # truthy; the description is passed as the failure message and is
            # never compared. An equality assertion may have been intended --
            # confirm the expected relation between the two values first.
            self.assertTrue(name, self.fastq[name.split()[0]].description)

コード例 #23

0

ファイルを表示

ファイル: pyfastx_fastq_random_access.py プロジェクト: wx904/pyfastx

import sys
import pyfastx
# Usage: <script> NAME_LIST FASTQ
# Print the sequence of every read whose name is listed, one name per line
fq = pyfastx.Fastq(sys.argv[2])

with open(sys.argv[1]) as name_list:
    for entry in name_list:
        read_name = entry.strip()
        print(fq[read_name].seq)