コード例 #1
0
	def setUp(self):
		"""Open the FASTQ fixtures and parse the flat file into reference data.

		Populates ``self.reads`` (0-based read index -> ``[name, sequence,
		quality]``) and ``self.bases`` (per-base counts over all sequence
		lines) by walking ``flat_fastq`` in groups of four lines.
		"""
		# Constructed twice on purpose: the second construction is the
		# "reload index" step of the fixture.
		self.fastq = pyfastx.Fastq(gzip_fastq)
		self.fastq = pyfastx.Fastq(gzip_fastq)

		# Uncompressed counterpart of the same data
		self.flatq = pyfastx.Fastq(flat_fastq)

		self.reads = {}
		self.bases = dict.fromkeys('ATGCN', 0)

		with open(flat_fastq) as fh:
			for lineno, line in enumerate(fh, 1):
				field = lineno % 4

				if field == 1:
					# Header line: drop the first character, keep the first token
					record = [line[1:].strip().split()[0], 0, 0]
					self.reads[len(self.reads)] = record

				elif field == 2:
					# Sequence line: store it and tally every tracked base
					record[1] = line.strip()
					for base in self.bases:
						self.bases[base] += line.count(base)

				elif field == 0:
					# Quality line closes the four-line record
					record[2] = line.strip()
コード例 #2
0
ファイル: test_fastq.py プロジェクト: wx904/pyfastx
    def test_build(self):
        """Remove any existing index file and rebuild it explicitly.

        The fixture object is dropped first, the ``.fxi`` file next to the
        gzipped FASTQ is deleted if present, ``build_index()`` is called on
        an object opened with ``build_index=False``, and finally the fixture
        is re-created.
        """
        del self.fastq

        index_path = '{}.fxi'.format(gzip_fastq)
        if os.path.exists(index_path):
            os.remove(index_path)

        unindexed = pyfastx.Fastq(gzip_fastq, build_index=False)
        unindexed.build_index()

        self.fastq = pyfastx.Fastq(gzip_fastq)
コード例 #3
0
def load_seqfile(infile):
    """Open *infile* as a pyfastx FASTA or FASTQ object.

    The file type is chosen from the file name suffix (``FASTA_SUFFIX`` /
    ``FASTQ_SUFFIX``).  ``build_index`` is passed as ``True`` only when no
    ``<infile>.fxi`` file exists next to the input.

    Args:
        infile: Path of the sequence file, as a string.

    Returns:
        A ``pyfastx.Fasta`` or ``pyfastx.Fastq`` object.

    Raises:
        ValueError: If the file name matches neither suffix set.  The
            previous implementation failed with ``UnboundLocalError`` here.
    """
    if infile.endswith(FASTA_SUFFIX):
        opener = pyfastx.Fasta
    elif infile.endswith(FASTQ_SUFFIX):
        opener = pyfastx.Fastq
    else:
        raise ValueError(
            "Could not determine sequence file format from name: {}".format(infile)
        )

    index_missing = not os.path.exists(infile + ".fxi")
    return opener(infile, build_index=index_missing)
コード例 #4
0
def fastx_info(args):
	"""Print summary statistics for the FASTA or FASTQ file ``args.fastx``.

	The format is determined by ``fastx_format_check``.  For FASTA input the
	report covers sequence count, size, GC content, base composition and
	length statistics; for FASTQ input it covers read count, size, GC
	content, base composition and the guessed quality encoding.  Any other
	format prints nothing.
	"""
	kind = fastx_format_check(args.fastx)

	if kind == 'fasta':
		fasta = pyfastx.Fasta(args.fastx)
		base_counts = fasta.composition
		print("Sequence counts: {}".format(len(fasta)))
		print("Total bases: {}".format(fasta.size))
		print("GC content: {:.2f}%".format(fasta.gc_content))
		for base in base_counts:
			print("{} counts: {}".format(base, base_counts[base]))
		print("Mean length: {:.2f}".format(fasta.mean))
		print("Median length: {:.2f}".format(fasta.median))
		print("Max length: {}".format(len(fasta.longest)))
		print("Min length: {}".format(len(fasta.shortest)))
		print("N50, L50: {}, {}".format(*fasta.nl()))
		print("length >= 1000: {}".format(fasta.count(1000)))
		return

	if kind == 'fastq':
		fastq = pyfastx.Fastq(args.fastx)
		base_counts = fastq.composition
		print("Read counts: {}".format(len(fastq)))
		print("Total bases: {}".format(fastq.size))
		print("GC content: {:.2f}%".format(fastq.gc_content))
		for base in base_counts:
			print("{} counts: {}".format(base, base_counts[base]))
		print("Quality encoding system maybe: {}".format(", ".join(fastq.encoding_type)))
コード例 #5
0
ファイル: utils.py プロジェクト: np-core/nanopath
def get_output_handle(fpath: str, fastx: bool = False, out: bool = True):
    """Return a handle for *fpath*.

    Args:
        fpath: A file path, or ``"-"`` for a standard stream.
        fastx: When true (and *fpath* is not ``"-"``), open the path with
            pyfastx instead of as a plain text file.
        out: With ``"-"``, selects ``sys.stdout`` (true) or ``sys.stdin``.

    Returns:
        ``sys.stdout`` / ``sys.stdin``, a ``pyfastx.Fasta`` / ``pyfastx.Fastq``
        object, or a file object opened for writing.

    Raises:
        NotADirectoryError: If the parent directory of *fpath* does not exist.
    """
    if fpath == "-":
        return sys.stdout if out else sys.stdin

    target = Path(fpath)
    if not target.parent.is_dir():
        raise NotADirectoryError(
            "Directory specified for output file does not exist: {}".format(
                target.parent
            )
        )

    if not fastx:
        return target.open("w")

    # Names ending in "a" are treated as FASTA, everything else as FASTQ
    reader = pyfastx.Fasta if fpath.endswith("a") else pyfastx.Fastq
    return reader(target)
コード例 #6
0
def run(args, name):
    """Extract run ID and barcode from the first read and store them as JSON.

    The description of the first read in ``args.inputFile`` is split on
    ``:``; fields 0-2 form the run ID and the last field is the barcode.
    The pair is added to a ``storage.JSON`` document under ``args.name``,
    pretty-printed, written to ``args.outputDir`` and logged.

    NOTE(review): the ``name`` parameter is not used; the result key comes
    from ``args.name``.  An input with no reads leaves ``runid`` and
    ``barcode`` undefined.
    """
    # Only the first read is inspected
    for read in pyfastx.Fastq(args.inputFile):
        fields = read.description.split(':')
        runid = f"{fields[0].replace('@', '')}_0{fields[1]}_A{fields[2]}"
        barcode = fields[-1]
        break

    # List of tuples used as input for the JSON document
    results = [("runID", runid), ("barcode", barcode)]

    # Without an existing JSON file, name the new document after the sample
    report = storage.JSON()
    if not args.JSON:
        report.name(args.sample)
    report.open(args.JSON)
    report.add_results(args.name, results)
    report.pretty_print()
    report.write(args.outputDir)

    logging.info(results)
コード例 #7
0
def fastq_sample(args):
	"""Write a random sample of reads from ``args.fastx`` in FASTQ format.

	The sample size is ``args.num`` (capped at the number of reads) when it
	is a positive number; otherwise it is ``args.prop`` (a proportion in
	``(0, 1]``) of the reads.  Output goes to ``args.outfile`` or, when that
	is ``None``, to standard output.

	Raises:
		RuntimeError: If neither a valid count nor a valid proportion is
			given, or the proportion rounds to zero reads.
	"""
	fq = pyfastx.Fastq(args.fastx)
	total = len(fq)
	prop = getattr(args, 'prop', None)

	if args.num is not None and args.num > 0:
		seq_num = min(args.num, total)

	# The proportion branch must test ``prop`` itself; testing ``num`` here
	# made proportion-only sampling unreachable.
	elif prop is not None and 0 < prop <= 1:
		seq_num = round(total*prop)
		if seq_num == 0:
			raise RuntimeError("the proportion is too small")

	else:
		raise RuntimeError("specify a right number for seq number or proportion")

	selected = random.sample(range(total), k=seq_num)

	to_stdout = args.outfile is None
	fw = sys.stdout if to_stdout else open(args.outfile, 'w')

	try:
		for idx in selected:
			r = fq[idx]
			fw.write("@{}\n{}\n+\n{}\n".format(r.name, r.seq, r.qual))
	finally:
		# stdout is only flushed; a file we opened is always closed
		if to_stdout:
			fw.flush()
		else:
			fw.close()
コード例 #8
0
	def test_iter_tuple(self):
		"""Iterating without an index yields (name, seq, qual) matching ``self.reads``."""
		records = pyfastx.Fastq(flat_fastq, build_index=False)
		for idx, (name, seq, qual) in enumerate(records):
			expected = self.reads[idx]
			self.assertEqual(name, expected[0])
			self.assertEqual(seq, expected[1])
			self.assertEqual(qual, expected[2])
コード例 #9
0
ファイル: utils.py プロジェクト: Wytamma/sketchy
def create_fastx_index(fastx):
    """Open *fastx* with pyfastx and pair it with the matching read builder.

    Returns:
        ``(pyfastx.Fasta, build_read_fasta)`` for FASTA input or
        ``(pyfastx.Fastq, build_read_fastq)`` for FASTQ input, both opened
        with ``build_index=True``.

    Raises:
        ValueError: If the input is recognised as neither format.
    """
    path = str(fastx)

    if is_fasta(fastx):
        return pyfastx.Fasta(path, build_index=True), build_read_fasta

    if is_fastq(fastx):
        return pyfastx.Fastq(path, build_index=True), build_read_fastq

    raise ValueError(f'Could not determine input file format: {fastx}')
コード例 #10
0
ファイル: index.py プロジェクト: sjin09/fastx
def build_index(infile):
    """Create ``<infile>.fxi`` by opening *infile* with pyfastx, unless it exists.

    Files whose names end in ``.fa``, ``.fa.gz``, ``.fasta`` or ``.fasta.gz``
    are opened as FASTA; everything else is opened as FASTQ.  Progress is
    reported on standard output.
    """
    if os.path.exists(infile + ".fxi"):
        print("fxi index is present")
        return

    # Message text (including its spelling) is kept unchanged
    print("buliding fxi index for {}".format(infile))
    fasta_suffixes = (".fa", ".fa.gz", ".fasta", ".fasta.gz")
    opener = pyfastx.Fasta if infile.endswith(fasta_suffixes) else pyfastx.Fastq
    opener(infile)
    print("fxi index has been created for {}".format(infile))
コード例 #11
0
ファイル: test_fastq.py プロジェクト: wx904/pyfastx
    def test_exception(self):
        """Invalid paths, indices and keys raise the expected exception types."""
        # Opening a path that does not exist
        self.assertRaises(
            FileExistsError, pyfastx.Fastq, 'a_fastq_file_not_exists'
        )

        # One past the last valid index
        self.assertRaises(IndexError, lambda: self.fastq[len(self.fastq)])

        # A key of an unsupported type
        self.assertRaises(KeyError, lambda: self.fastq[int])

        # A read name that is not in the file
        self.assertRaises(KeyError, lambda: self.fastq['abc'])
コード例 #12
0
ファイル: utils.py プロジェクト: np-core/nanopath
def create_fastx_index(fastx: Path) -> (pyfastx.Fasta, Path):
    """Open *fastx* with pyfastx and return it with the path of its index.

    Returns:
        A tuple of the ``pyfastx.Fasta`` / ``pyfastx.Fastq`` object (opened
        with ``build_index=True``) and ``Path(<fastx>.fxi)``.

    Raises:
        ValueError: If the input is recognised as neither FASTA nor FASTQ.
    """
    if is_fasta(fastx):
        reader = pyfastx.Fasta
    elif is_fastq(fastx):
        reader = pyfastx.Fastq
    else:
        raise ValueError(
            f'Could not determine input file format: {fastx}'
        )

    fastx_str = str(fastx)
    return reader(fastx_str, build_index=True), Path(fastx_str + '.fxi')
コード例 #13
0
def fastx_fq2fa(args):
	"""Convert the FASTQ file ``args.fastx`` to FASTA.

	Each read is written as ``>name`` followed by its sequence.  Output goes
	to ``args.outfile`` when it is set, otherwise to standard output.
	"""
	fq = pyfastx.Fastq(args.fastx)

	fh = open(args.outfile, 'w') if args.outfile else sys.stdout

	try:
		for read in fq:
			fh.write(">{}\n{}\n".format(read.name, read.seq))
	finally:
		# Close the file we opened even if iteration or writing fails;
		# stdout is only flushed.
		if args.outfile:
			fh.close()
		else:
			fh.flush()
コード例 #14
0
ファイル: utils.py プロジェクト: np-core/nanopath
    def prepare_fastq(self) -> dict:

        """ Check that every input file exists and open each with pyfastx.

        Iterates ``self.composition`` (organism -> dict with a ``'file'``
        entry) and returns a dict mapping each organism to a
        ``pyfastx.Fastq`` object for its file.

        Raises:
            ValueError: If a listed file does not exist.
        """

        indexed = {}
        for organism, data in self.composition.items():
            read_file = data['file']
            read_path = Path(read_file)
            if not read_path.exists():
                raise ValueError(f'File {read_path} does not exist.')
            indexed[organism] = pyfastx.Fastq(read_file)

        self.logger.info('Prepared read files - proceeding')

        return indexed
コード例 #15
0
def fastq_split(args):
	"""Split the FASTQ file ``args.fastx`` into several smaller files.

	With ``args.file_num`` set, the reads are divided over that many files;
	otherwise each file receives ``args.seq_count`` reads.  Part files are
	named ``<name>.<NN><suffixes>`` (zero-padded part number inserted before
	the original suffixes), placed in ``args.outdir`` when given, and
	gzip-compressed when the input is gzip-compressed.

	An input without reads produces no output files.
	"""
	fq = pyfastx.Fastq(args.fastx)

	if args.file_num:
		seqs_num = math.ceil(len(fq)/args.file_num)
		parts_num = args.file_num
	else:
		seqs_num = args.seq_count
		parts_num = math.ceil(len(fq)/seqs_num)

	name, suffix1 = os.path.splitext(os.path.basename(args.fastx))

	# For gzip input strip a second suffix (e.g. ".fq" of ".fq.gz"); it
	# stays empty for plain files so one name template serves both cases.
	suffix2 = ''
	if fq.is_gzip:
		name, suffix2 = os.path.splitext(name)

	digit = len(str(parts_num))

	seq_write = 0
	fh = None
	file_num = 0

	try:
		for read in fq:
			if seq_write == 0:
				# Start of a new part: open the next output file
				file_num += 1
				subfile = "{}.{}{}{}".format(name, str(file_num).zfill(digit), suffix2, suffix1)

				if args.outdir is not None:
					subfile = os.path.join(args.outdir, subfile)

				if fq.is_gzip:
					fh = gzip.open(subfile, 'wt')
				else:
					fh = open(subfile, 'w')

			fh.write("@{}\n{}\n+\n{}\n".format(read.name, read.seq, read.qual))
			seq_write += 1

			if seq_write == seqs_num:
				fh.close()
				fh = None
				seq_write = 0
	finally:
		# Close a partially filled last part; nothing to close when the
		# input had no reads or the last part was exactly full.
		if fh is not None:
			fh.close()
コード例 #16
0
    def setUp(self):
        """Open the gzipped FASTQ and collect read names from the flat file.

        ``self.keys`` holds the first token (minus its first character) of
        every line starting with ``@``; ``self.count`` is their number.

        NOTE(review): any line starting with ``@`` is taken as a header --
        confirm the fixture has no quality lines that begin with ``@``.
        """
        self.fastq = pyfastx.Fastq(gzip_fastq)

        names = []
        with open(flat_fastq) as fh:
            for line in fh:
                if line.startswith('@'):
                    names.append(line.split()[0][1:])

        self.keys = names
        self.count = len(names)
コード例 #17
0
import sys
import pyfastx

# Print the name of every read in the FASTQ file given as the first
# command-line argument, one name per line.
for read in pyfastx.Fastq(sys.argv[1]):
    print(read.name)
コード例 #18
0
import sys
import pyfastx

# Construct a Fastq object for the file given as the first command-line
# argument and discard it; the script is run only for the side effects of
# construction (presumably building the index file -- TODO confirm).
pyfastx.Fastq(sys.argv[1])
コード例 #19
0
import sys
import random
import pyfastx

# Seed the generator from the first argument so the selection is repeatable
random.seed(sys.argv[1])

# For every FASTQ file named on the command line, pick 10000 read positions
# at random and write the names of those reads to "<file>.list".
for fqfile in sys.argv[2:]:
    fq = pyfastx.Fastq(fqfile)
    samples = set(random.sample(range(len(fq)), 10000))

    with open("{}.list".format(fqfile), 'w') as fw:
        # r.id is shifted by one to compare against the 0-based positions
        fw.writelines(
            "{}\n".format(r.name) for r in fq if (r.id - 1) in samples
        )

    print(fqfile)
コード例 #20
0
ファイル: cutsite.py プロジェクト: koszullab/hicstuff
def cut_ligation_sites(
    fq_for, fq_rev, digest_for, digest_rev, enzyme, mode, seed_size, n_cpu
):
    """Create new reads to manage pairs with a digestion and create multiple
    pairs to take into account all the contacts present.

    The function writes two files, one for the forward and one for the
    reverse fastq, with the new reads. The new reads have ":[0-9]" added at
    the end of the ID to differentiate the different pairs created from one
    read.

    The function will look for all the sites present and create new pairs of
    reads according to the mode given to retrieve as much as possible of the
    HiC signal.

    If the two input files are not sorted identically (mate names differ),
    the error is logged, the writer process is shut down and the program
    exits with status 1.

    Parameters
    ----------
    fq_for : str
        Path to the forward fastq file to digest.
    fq_rev : str
        Path to the reverse fastq file to digest.
    digest_for : str
        Path to the output digested forward fastq file to write.
    digest_rev : str
        Path to the output digested reverse fastq file to write.
    enzyme : str
        The list of restriction enzyme used to digest the genome separated by a
        comma. Example: HpaII,MluCI.
    mode : str
        Mode to use to make the digestion. Three values possible: "all",
        "for_vs_rev", "pile".
    seed_size : int
        Minimum size of a fragment (i.e. seed size used in mapping as reads
        smaller won't be mapped.)
    n_cpu : int
        Number of CPUs.
    """
    # Process the ligation sites given
    ligation_sites = hcd.gen_enzyme_religation_regex(enzyme)

    # Defined stop_token which is used to mark the end of input file
    stop_token = "STOP"
    # A stack is a string containing multiple read pairs
    max_stack_size = 1000

    # Create count to have an idea of the digested pairs repartition.
    original_number_of_pairs = 0
    final_number_of_pairs = 0
    new_reads_for = ""
    new_reads_rev = ""
    current_stack = 0

    # Start the writer process that consumes the stacks put in the queue.
    # ctx = multiprocessing.get_context("spawn")
    queue = multiprocessing.Queue(max(1, n_cpu - 1))
    writer_process = multiprocessing.Process(
        target=_writer, args=(digest_for, digest_rev, queue, stop_token)
    )
    writer_process.start()

    # Iterate on all pairs
    for read_for, read_rev in zip(
        pyfastx.Fastq(fq_for, build_index=False),
        pyfastx.Fastq(fq_rev, build_index=False),
    ):

        # Count the numbers of original reads processed.
        original_number_of_pairs += 1

        # Count for stack size.
        current_stack += 1

        # Extract components of the reads.
        for_name, for_seq, for_qual = read_for
        rev_name, rev_seq, rev_qual = read_rev

        # Sanity check to be sure all reads are with their mate.
        if for_name != rev_name:
            # The reads are plain (name, seq, qual) tuples here, so report
            # the unpacked names; the tuples have no ``id`` attribute.
            logger.error(
                "The fastq files contains reads not sorted :\n{0}\n{1}".format(
                    for_name, rev_name
                )
            )
            # Stop the writer the same way as on normal completion so the
            # child process does not keep waiting on the queue at exit.
            queue.put(stop_token)
            writer_process.join()
            sys.exit(1)

        # Cut the forward and reverse reads at the ligation sites.
        for_seq_list, for_qual_list = cutsite_read(
            ligation_sites, for_seq, for_qual, seed_size,
        )
        rev_seq_list, rev_qual_list = cutsite_read(
            ligation_sites, rev_seq, rev_qual, seed_size,
        )

        # Write the new combinations of fragments.
        new_reads_for, new_reads_rev, final_number_of_pairs = write_pair(
            new_reads_for,
            new_reads_rev,
            for_name,
            for_seq_list,
            for_qual_list,
            rev_seq_list,
            rev_qual_list,
            mode,
            final_number_of_pairs,
        )

        # If stack full, add it in the queue.
        if current_stack == max_stack_size:

            # Add the pair in the queue.
            pairs = (new_reads_for.encode(), new_reads_rev.encode())
            queue.put(pairs)

            # Empty the stack
            current_stack = 0
            new_reads_for = ""
            new_reads_rev = ""

    # End the parallel processing: flush the last (possibly partial) stack,
    # then signal the writer to stop and wait for it.
    pairs = (new_reads_for.encode(), new_reads_rev.encode())
    queue.put(pairs)
    queue.put(stop_token)
    writer_process.join()

    # Return information on the different pairs created
    logger.info(f"Library used: {fq_for} - {fq_rev}")
    logger.info(
        f"Number of pairs before digestion: {original_number_of_pairs}"
    )
    logger.info(
        f"Number of pairs after digestion: {final_number_of_pairs}"
    )
コード例 #21
0
#!/usr/bin/env python

if __name__ == "__main__":
	# Count reads, total sequence length and total quality length of a FASTQ
	# file and print the three numbers tab-separated.
	import sys
	import re
	import gzip
	import pyfastx

	if len(sys.argv) == 1:
		print("Usage: fqcnt.py <in.fq.gz>")
		sys.exit(0)

	n = slen = qlen = 0
	for _name, seq, qual in pyfastx.Fastq(sys.argv[1], build_index=False):
		n += 1
		slen += len(seq)
		# A missing or empty quality string contributes nothing
		qlen += len(qual) if qual else 0

	print('{}\t{}\t{}'.format(n, slen, qlen))
コード例 #22
0
ファイル: test_fastq.py プロジェクト: wx904/pyfastx
    def test_full_name(self):
        """Iterate with ``full_name=True`` and look each read up in the index.

        NOTE(review): the second argument of ``assertTrue`` is the failure
        message, so this only checks that ``name`` is truthy and that the
        lookup by the first token succeeds; a comparison against the
        description may have been intended -- confirm.
        """
        records = pyfastx.Fastq(flat_fastq, build_index=False, full_name=True)

        for name, _seq, _qual in records:
            read_id = name.split()[0]
            self.assertTrue(name, self.fastq[read_id].description)
コード例 #23
0
import sys
import pyfastx
# FASTQ file to look reads up in (second command-line argument)
fq = pyfastx.Fastq(sys.argv[2])

# The first argument is a text file with one read name per line; print the
# sequence of each named read.
with open(sys.argv[1]) as fh:
    for name in map(str.strip, fh):
        print(fq[name].seq)