Python xopenの例、xopen.xopen Pythonの例

コード例 #1

0

ファイルを表示

	def open2(path1, path2):
		"""Open up to two output files in text write mode.

		The second file is only opened when a first path was given as well.
		Returns a (file1, file2) tuple; an entry that was not opened is None.
		"""
		if path1 is None:
			# Without a first path nothing is opened, even if path2 is set.
			return None, None
		first = xopen(path1, 'w')
		second = xopen(path2, 'w') if path2 is not None else None
		return first, second

コード例 #2

0

ファイルを表示

    def run_pe_p7_bc(self):
        """
        Run the index1 step on the paired input files, then run
        ``run_pe_barcode_single`` on the resulting read1 files in parallel.

        structure:
        |--_tmp
        |--index1
        |    |--barcode
        |    |    |--file.fq
        """
        # step1. index1
        outdir1 = os.path.join(self.outdir, '_tmp')
        with xopen(self.fq1, 'rt') as r1, xopen(self.fq2, 'rt') as r2:
            self.index_pe(r1, r2, outdir1)
        # rename files in dirs/
        flist1 = sorted(self.wrap_dir(outdir1, mode='index1'))  # sort, read1/read2

        # step2. index2
        # step3. barcode
        # Every second entry of the sorted list is handed to the barcode step.
        read1_files = flist1[0::2]
        if read1_files:
            # Pool() raises ValueError for processes < 1, so clamp the job
            # count and skip the pool entirely when there is nothing to do.
            n_jobs = max(1, min(self.parallel_jobs, len(read1_files)))
            with Pool(processes=n_jobs) as pool:
                pool.map(self.run_pe_barcode_single, read1_files)  # read1

        # step4. rename files
        self.wrap_read_count()
        self.wrap_file()

コード例 #3

0

ファイルを表示

ファイル: bert_fine_tune_multigpu.py プロジェクト: TurkuNLP/multilingual-register-labeling

    def __init__(self, model):
        """Prepare best-F1 bookkeeping and load the dev-set label matrix.

        When ``args.print_lr`` is set, ``self.lr`` is built from the optimizer
        of *model*: a linear warm-up followed by a linear decay towards
        ``min_lr``.
        """
        if args.print_lr:
            opt = model.optimizer
            t = K.cast(opt.iterations, K.floatx()) + 1
            self.lr = K.switch(
                t <= opt.warmup_steps,
                # warm-up phase: scale the base rate by the fraction completed
                opt.lr * (t / opt.warmup_steps),
                # decay phase: interpolate from lr down to min_lr
                opt.min_lr +
                (opt.lr - opt.min_lr) *
                (1.0 - K.minimum(t, opt.decay_steps) / opt.decay_steps),
            )

        self.best_f1 = self.best_f1_epoch = self.best_f1_threshold = 0

        # NOTE(review): the dev file is chosen by testing args.label_mapping,
        # while the mapping below is loaded by testing args.dev_all. This
        # looks like the two options are expected to be set together - confirm.
        file_name = args.dev_all if args.label_mapping is not None else args.dev
        with xopen(file_name, "rt") as f:
            # The first line holds [example_count, label_dim].
            header = json.loads(f.readline())
            example_count, label_dim = header
            self.all_labels = lil_matrix((example_count, label_dim), dtype='b')
            for row, line in tqdm(enumerate(f), desc="Reading dev labels"):
                # Element 1 of each record indexes the columns to switch on.
                self.all_labels[row, json.loads(line)[1]] = 1
            print("Dev labels shape:", self.all_labels.shape)
        if args.dev_all is not None:
            with xopen(args.label_mapping) as f:
                self.labels_mapping = json.loads(f.read())

コード例 #4

0

ファイルを表示

 def open2(path1, path2):
     """Open up to two output files in binary write mode.

     Both files are opened with ``compresslevel=compression_level``. The
     second file is only opened when a first path was given as well.
     Returns a (file1, file2) tuple; an entry that was not opened is None.
     """
     if path1 is None:
         # Without a first path nothing is opened, even if path2 is set.
         return None, None
     first = xopen(path1, 'wb', compresslevel=compression_level)
     second = None
     if path2 is not None:
         second = xopen(path2, 'wb', compresslevel=compression_level)
     return first, second

コード例 #5

0

ファイルを表示

ファイル: bam_preprocess.py プロジェクト: TF-Chan-Lab/rG4-seeker

    def _random_generate(self):
        """
        Generate sorted random number tables.

        For every chromosome a table named
        ``<out_prefix>.<chrom>.bootstrap_NN.randtable.gz`` is written, one
        ``line_id<TAB>count`` entry per line:

        - bootstrap 00 lists every line id with a count of 1;
        - bootstraps 01..``self.no_of_bootstraps`` list, for each line id, how
          often its read id was drawn in ``self.total_no_of_processed_reads``
          random draws; line ids whose read was never drawn are omitted.

        The per-bootstrap seeds are themselves drawn after seeding with
        ``RANDOM_SEED``.
        """
        # Bootstrap 0: every line gets a count of exactly 1.
        for chrom, _ in self.no_of_reads_by_chromosome:
            with xopen(self.out_prefix + '.{0}.bootstrap_{1:02d}.randtable.gz'.format(chrom, 0), 'wb') as fw:
                for line_id, _ in enumerate(self.all_line_id_to_read_id[chrom]):
                    fw.write("{0}\t{1}\n".format(line_id, 1).encode('utf-8'))

        random.seed(a=RANDOM_SEED, version=2)
        bootstrap_seeds = []
        for i in range(self.no_of_bootstraps):
            bootstrap_seeds.append(random.randrange(self.total_no_of_processed_reads))

        for i in range(self.no_of_bootstraps):
            # Each value gets its own placeholder; the original message reused
            # {0} and therefore printed the timestamp as the bootstrap number
            # and the bootstrap number as the BAM file.
            logging.debug(
                "[{0}] Generating random number tables for bootstrap {1}, BAM file {2}".format(
                    time.ctime(), i + 1, self.in_bam))
            random.seed(a=bootstrap_seeds[i], version=2)
            # count_table[read_id] = number of times that read id was drawn.
            count_table = array.array('l', [0] * self.total_no_of_processed_reads)
            for _ in range(self.total_no_of_processed_reads):
                count_table[random.randrange(self.total_no_of_processed_reads)] += 1
            for chrom, _ in self.no_of_reads_by_chromosome:
                with xopen(self.out_prefix + '.{0}.bootstrap_{1:02d}.randtable.gz'.format(chrom, i + 1),
                           mode='wb') as fw:
                    for line_id, read_id in enumerate(self.all_line_id_to_read_id[chrom]):
                        if count_table[read_id]:
                            fw.write("{0}\t{1}\n".format(line_id, count_table[read_id]).encode('utf-8'))

コード例 #6

0

ファイルを表示

 def concat(self):
     """Concatenate the split files of every ``self.concat_dict`` entry.

     Each key is a target path, written with ``compresslevel=9``; each value
     is the list of source paths whose lines are appended in order.
     """
     for target_path, part_paths in self.concat_dict.items():
         with xopen(target_path, 'wb', compresslevel=9) as target:
             for part_path in part_paths:
                 with xopen(part_path, 'rb') as part:
                     for raw_line in part:
                         target.write(raw_line)

コード例 #7

0

ファイルを表示

    def run(self):
        """Read chunks from the input file(s) and hand them to the workers.

        Single-file input is read with ``dnaio.read_chunks``, paired input with
        ``dnaio.read_paired_chunks``. Afterwards -1 is sent once per
        connection, to the workers whose indices arrive on ``self.queue``. If
        anything raises, every connection receives -2 followed by an
        (exception, traceback text) tuple.
        """
        if self.stdin_fd != -1:
            # Replace sys.stdin with a file object for the given descriptor.
            sys.stdin.close()
            sys.stdin = os.fdopen(self.stdin_fd)
        try:
            with xopen(self.file, 'rb') as f:
                if not self.file2:
                    single_chunks = dnaio.read_chunks(f, self.buffer_size)
                    for index, chunk in enumerate(single_chunks):
                        self.send_to_worker(index, chunk)
                else:
                    with xopen(self.file2, 'rb') as f2:
                        paired_chunks = dnaio.read_paired_chunks(
                            f, f2, self.buffer_size)
                        for index, (chunk1, chunk2) in enumerate(paired_chunks):
                            self.send_to_worker(index, chunk1, chunk2)

            # Send poison pills to all workers
            for _ in self.connections:
                ready_worker = self.queue.get()
                self.connections[ready_worker].send(-1)
        except Exception as e:
            # TODO better send this to a common "something went wrong" Queue
            for conn in self.connections:
                conn.send(-2)
                conn.send((e, traceback.format_exc()))

コード例 #8

0

ファイルを表示

def test_append():
	"""Appending twice in binary mode must yield both payloads on read-back."""
	extensions = ["", ".gz"]
	# BZ2 does NOT support append in Py 2.
	if bz2 and sys.version_info > (3,):
		extensions.append(".bz2")
	if lzma:
		extensions.append(".xz")
	# On Py3, need to send BYTES, not unicode. Let's do it for all.
	payload = "AB".encode("utf-8")
	for ext in extensions:
		expected = payload + payload
		with temporary_path('truncated.fastq' + ext) as path:
			# Start from a clean slate; a missing file is fine.
			try:
				os.unlink(path)
			except OSError:
				pass
			for _ in range(2):
				with xopen(path, 'ab') as f:
					f.write(payload)
			with xopen(path, 'r') as f:
				for appended in f:
					pass
				# Compare as text where the payload supports decoding.
				try:
					expected = expected.decode("utf-8")
				except AttributeError:
					pass
				assert appended == expected

コード例 #9

0

ファイルを表示

 def fq_merge(self, fout, qlist):
     """
     Copy the content of every file in *qlist*, in order, into *fout*.

     All files are opened through xopen in binary mode.
     """
     with xopen(fout, 'wb') as merged:
         for source_path in qlist:
             with xopen(source_path, 'rb') as source:
                 shutil.copyfileobj(source, merged)

コード例 #10

0

ファイルを表示

 def compress_output(self, f_in):
     """
     Copy *f_in* to ``<self.outdir>/<basename of f_in>.gz`` through xopen.
     """
     target_name = os.path.basename(f_in) + '.gz'
     f_out = os.path.join(self.outdir, target_name)
     log.info('Saving file: {}'.format(f_out))
     # original note: pigz faster than gzip
     with xopen(f_in, 'rb') as source, xopen(f_out, 'wb') as target:
         shutil.copyfileobj(source, target)

コード例 #11

0

ファイルを表示

ファイル: fsr_precompute.py プロジェクト: TF-Chan-Lab/rG4-seeker

 def run(self, processes=8):
     """Run all precompute jobs in a process pool, then merge their outputs.

     processes -- size of the multiprocessing pool.

     For each ``self.concat_dict`` entry the split files are appended, in
     order, to the target file (``compresslevel=9``); each split file is
     removed once it has been copied.
     """
     with multiprocessing.Pool(processes) as pool:
         pending = [pool.apply_async(job.run, args=()) for job in self.precompute_thread_list]
         # Wait for every job; get() re-raises a worker's exception here.
         for result in pending:
             result.get()
     for merged_path, part_paths in self.concat_dict.items():
         with xopen(merged_path, 'wb', compresslevel=9) as merged:
             for part_path in part_paths:
                 with xopen(part_path, 'rb') as part:
                     for raw_line in part:
                         merged.write(raw_line)
                 os.remove(part_path)

コード例 #12

0

ファイルを表示

    def run_pe_bc(self):
        """Run the paired-end barcode step and wrap up its output files."""
        # step1: split both input files into <outdir>/_tmp
        tmp_dir = os.path.join(self.outdir, '_tmp')
        with xopen(self.fq1, 'rt') as r1:
            with xopen(self.fq2, 'rt') as r2:
                self.barcode_pe(r1, r2, tmp_dir)
        # rename files in dirs/ (the returned file list is not used here)
        self.wrap_dir(tmp_dir, mode='barcode')

        # save files
        self.wrap_read_count()
        self.wrap_file()

コード例 #13

0

ファイルを表示

ファイル: test_xopen.py プロジェクト: pycompression/xopen

def test_append_text(ext, tmp_path):
    """Two text-mode appends must read back as one concatenated line."""
    text = "AB"
    path = tmp_path / f"the-file{ext}"
    for _ in range(2):
        with xopen(path, "at") as f:
            f.write(text)
    with xopen(path, "rt") as f:
        # Keep only the last line of the file.
        for appended in f:
            pass
        assert appended == text + text

コード例 #14

0

ファイルを表示

ファイル: test_xopen.py プロジェクト: condector/xopen

def test_append_text(ext, tmpdir):
    """Two text-mode appends must read back as one concatenated line."""
    piece = "AB"
    expected = piece + piece
    path = str(tmpdir.join("the-file" + ext))
    for _ in range(2):
        with xopen(path, "at") as writer:
            writer.write(piece)
    with xopen(path, "rt") as reader:
        # Keep only the last line of the file.
        for appended in reader:
            pass
        assert appended == expected

コード例 #15

0

ファイルを表示

ファイル: test_open.py プロジェクト: marcelm/dnaio

def test_write_with_xopen(tmp_path, fileformat, extension):
    """Write one record through dnaio on top of an xopen handle, then re-read it."""
    record = dnaio.SequenceRecord('name', 'ACGT', 'HHHH')
    out_fastq = tmp_path / ("out." + fileformat + extension)
    with xopen(out_fastq, 'wb') as outer_f, \
            dnaio.open(outer_f, mode='w', fileformat=fileformat) as f:
        f.write(record)

    if fileformat == "fasta":
        expected = ">name\nACGT\n"
    else:
        expected = "@name\nACGT\n+\nHHHH\n"
    with xopen(out_fastq) as f:
        assert f.read() == expected

コード例 #16

0

ファイルを表示

ファイル: adrsmlib.py プロジェクト: maxibor/adrsm

def write_fastq_multi(fastq_list, outputfile, compressed=True):
    """Append paired reads to two FASTQ files sharing the prefix *outputfile*.

    fastq_list -- iterable of items whose elements 0 and 1 are the text to
        append to the ".1" and ".2" file respectively.
    outputfile -- path prefix; ".1.fastq"/".2.fastq" is added, plus ".gz"
        when *compressed* is true.
    compressed -- write through xopen (encoded bytes) instead of plain open.
    """
    if not compressed:
        with open(outputfile + ".1.fastq", "a") as f1, \
                open(outputfile + ".2.fastq", "a") as f2:
            for read in fastq_list:
                first, second = read[0], read[1]
                f1.write(first)
                f2.write(second)
        return

    with xopen(outputfile + ".1.fastq.gz", "ab") as f1, \
            xopen(outputfile + ".2.fastq.gz", "ab") as f2:
        for read in fastq_list:
            first, second = read[0], read[1]
            f1.write(first.encode())
            f2.write(second.encode())

コード例 #17

0

ファイルを表示

ファイル: test_xopen.py プロジェクト: condector/xopen

def test_append(ext, tmpdir):
    """Two binary appends must read back, in text mode, as one joined line."""
    chunk = b"AB"
    path = str(tmpdir.join("the-file" + ext))
    for _ in range(2):
        with xopen(path, "ab") as writer:
            writer.write(chunk)
    with xopen(path, "r") as reader:
        # Keep only the last line of the file.
        for appended in reader:
            pass
        expected = (chunk + chunk).decode("utf-8")
        assert appended == expected

コード例 #18

0

ファイルを表示

ファイル: test_xopen.py プロジェクト: pycompression/xopen

def test_append(ext, tmp_path):
    """Two binary appends must read back, in text mode, as one joined line."""
    text = b"AB"
    path = tmp_path / f"the-file{ext}"
    for _ in range(2):
        with xopen(path, "ab") as f:
            f.write(text)
    with xopen(path, "r") as f:
        # Keep only the last line of the file.
        for appended in f:
            pass
        assert appended == (text + text).decode("utf-8")

コード例 #19

0

ファイルを表示

ファイル: pipeline.py プロジェクト: mervegozel/cutadapt

def reader_process(file, file2, connections, queue, buffer_size, stdin_fd):
    """
    Read chunks of FASTA or FASTQ data from *file* and send to a worker.

    queue -- a Queue of worker indices. A worker writes its own index into this
        queue to notify the reader that it is ready to receive more data.
    connections -- a list of Connection objects, one for each worker.

    The function repeatedly

    - reads a chunk from the file
    - reads a worker index from the Queue
    - sends the chunk to connections[index]

    and finally sends "poison pills" (the value -1) to all connections.
    If an exception occurs, every connection instead receives -2 followed by
    an (exception, traceback text) tuple.
    """

    def dispatch(chunk_index, *chunks):
        # Determine the worker that should get this chunk
        pipe = connections[queue.get()]
        pipe.send(chunk_index)
        for data in chunks:
            pipe.send_bytes(data)

    if stdin_fd != -1:
        sys.stdin.close()
        sys.stdin = os.fdopen(stdin_fd)
    try:
        with xopen(file, 'rb') as f:
            if not file2:
                for chunk_index, chunk in enumerate(
                        dnaio.read_chunks(f, buffer_size)):
                    dispatch(chunk_index, chunk)
            else:
                with xopen(file2, 'rb') as f2:
                    for chunk_index, (chunk1, chunk2) in enumerate(
                            dnaio.read_paired_chunks(f, f2, buffer_size)):
                        dispatch(chunk_index, chunk1, chunk2)

        # Send poison pills to all workers
        for _ in connections:
            connections[queue.get()].send(-1)
    except Exception as e:
        # TODO better send this to a common "something went wrong" Queue
        for conn in connections:
            conn.send(-2)
            conn.send((e, traceback.format_exc()))

コード例 #20

0

ファイルを表示

ファイル: fasta.py プロジェクト: nunoalexandrefaria/bakta

def import_contigs(contigs_path):
    """Import raw contigs.

    Parses the FASTA file at *contigs_path* (opened through xopen) and returns
    a list with one dict per record, holding the record id, description,
    upper-cased sequence and its length, plus fixed 'complete', 'type' and
    'topology' entries.

    Raises ValueError for the first sequence that does not fully match
    FASTA_DNA_SEQUENCE_PATTERN.
    """
    contigs = []
    with xopen(str(contigs_path), threads=0) as fh:
        for record in SeqIO.parse(fh, 'fasta'):
            seq = str(record.seq).upper()
            if FASTA_DNA_SEQUENCE_PATTERN.fullmatch(seq) is None:
                # Pass the id as a logging argument; the original call left the
                # %s placeholder unfilled, so the offending id was never logged.
                log.error(
                    'import: Fasta sequence contains invalid DNA characters! id=%s',
                    record.id
                )
                raise ValueError(
                    f'Fasta sequence contains invalid DNA characters! id={record.id}'
                )
            contig = {
                'id': record.id,
                'description': record.description,
                'sequence': seq,
                'length': len(seq),
                'complete': False,
                'type': bc.REPLICON_CONTIG,
                'topology': bc.TOPOLOGY_LINEAR
            }
            log.info(
                'imported: id=%s, length=%i, complete=%s, topology=%s, description=%s',
                contig['id'], contig['length'], contig['complete'],
                contig['topology'], contig['description'])
            contigs.append(contig)
    return contigs

コード例 #21

0

ファイルを表示

ファイル: test_xopen.py プロジェクト: pycompression/xopen

def test_has_iter_method(ext, tmp_path):
    """A handle opened for writing must still expose ``__iter__``."""
    target = tmp_path / f"out{ext}"
    with xopen(target, mode="w") as handle:
        # Writing anything is not strictly necessary, but without it
        # pbzip2 causes a delay of one second
        handle.write("hello")
        has_iter = hasattr(handle, "__iter__")
        assert has_iter

コード例 #22

0

ファイルを表示

def check_haplotag_list_information(haplotag_list, exit_stack):
    """
    Inspect the first line of a haplotag list file and pick a line parser.

    A first line with at least 4 tab-separated columns (assumed to be read
    name, haplotype, phaseset, chromosome) selects the four-column parser and
    reports chromosome information as available; at least 2 columns selects
    the two-column parser.

    :param haplotag_list: Tab-separated file with at least 2 or 4 columns
    :param exit_stack: context stack that takes ownership of the opened file
    :return: tuple (opened file rewound to the start, has_chrom_info, parser)
    :raises ValueError: if the first line has fewer than 2 tab-separated columns
    """
    haplo_list = exit_stack.enter_context(xopen(haplotag_list))
    first_line = haplo_list.readline().strip()
    # rewind to make sure a header-less file is processed correctly
    haplo_list.seek(0)

    n_columns = len(first_line.split("\t"))
    if n_columns >= 4:
        return haplo_list, True, _four_column_parser
    if n_columns >= 2:
        return haplo_list, False, _two_column_parser
    raise ValueError(
        "First line of haplotag list file does not have "
        "at least 2 columns, or it is not tab-separated: {}".format(
            first_line))

コード例 #23

0

ファイルを表示

ファイル: seqio.py プロジェクト: youngjack86/cutadapt

 def __init__(self, file):
     """Accept a path (opened here for writing) or an already open file object.

     ``_close_on_exit`` records whether this object opened the file itself.
     """
     opened_here = isinstance(file, str)
     self._file = xopen(file, 'w') if opened_here else file
     self._close_on_exit = opened_here

コード例 #24

0

ファイルを表示

ファイル: test_xopen.py プロジェクト: pycompression/xopen

def test_readinto(fname):
    """``readinto`` must fill an oversized buffer with exactly the file content."""
    expected = CONTENT.encode("utf-8")
    # Leave slack so the whole content always fits.
    buffer = bytearray(len(expected) + 100)
    with xopen(fname, "rb") as f:
        n_read = f.readinto(buffer)
    assert n_read == len(expected)
    assert buffer[:n_read] == expected

コード例 #25

0

ファイルを表示

def identify(input_files: Tuple, output: os.PathLike = "duplicates.json"):
    """
    Identifies fragments with duplicated sequences.

    Merges the hashed dictionaries (in json format) generated by the "parse" subcommand and
    identifies read with exactly the same sequence (share an identical hash). Duplicated read
    identifiers (hashed) are output in json format. The "remove" subcommand uses this dictionary
    to remove duplicates from fastq files.


    \f
    Args:
     input_files (Tuple): Paths to json files containing dictionaries with hashed read ids as the keys
                          and hashed sequences as the values.
     output (os.PathLike, optional): Duplicate read ids identified. Defaults to "duplicates.json".
    """

    all_read_ids = set()
    read_id_by_sequence = {}

    # NOTE(review): this shuffles a temporary array copy, so the iteration
    # order of input_files below is unchanged - confirm whether that is intended.
    np.random.shuffle(np.array(input_files))
    for json_path in input_files:
        hashes = load_json(json_path)  # {READ_NAME_HASH: SEQUENCE_HASH}
        all_read_ids.update(hashes)
        # {SEQUENCE_HASH: READ_NAME_HASH}; a later entry replaces an earlier
        # one that has the same sequence hash.
        read_id_by_sequence.update(invert_dict(hashes))

    # Every read id that did not survive as the representative of its sequence.
    duplicated_ids = all_read_ids.difference(read_id_by_sequence.values())
    del all_read_ids
    del read_id_by_sequence

    with xopen(output, "w") as w:
        ujson.dump(dict.fromkeys(duplicated_ids), w)

コード例 #26

0

ファイルを表示

def align_query_genome(config, dna_fragments_path, dna_fragments,
                       ref_genome_id):
    """Perform per-genome calculation of ANI/conserved DNA values.

    The reference genome ``<ref_genome_id>.fna.gz`` from ``config['db_path']``
    is unpacked into a temporary directory, aligned via ``execute_nucmer`` and
    the temporary directory is removed again, also when an error occurs.

    :param config: a global config object encapsulating global runtime vars
    :param dna_fragments_path: passed through to ``execute_nucmer``.
    :param dna_fragments: A dict comprising information on fragments.
    :param ref_genome_id: reference genome id.

    :rtype: A tuple (ref_genome_id, ani, conserved_dna).
    """

    tmp_dir = Path(tempfile.mkdtemp())
    try:
        reference_genome_zipped_path = config['db_path'].joinpath(
            f'{ref_genome_id}.fna.gz')
        reference_genome_path = tmp_dir.joinpath(f'{ref_genome_id}.fna')
        with reference_genome_path.open(mode='w') as fh_out, xopen(
                str(reference_genome_zipped_path), threads=0) as fh_in:
            for line in fh_in:
                fh_out.write(line)

        dna_fragment_matches = execute_nucmer(config, tmp_dir, dna_fragments,
                                              dna_fragments_path,
                                              reference_genome_path)
    finally:
        # mkdtemp() leaves cleanup to the caller; do it even if unpacking or
        # the alignment failed so no temporary directories pile up.
        shutil.rmtree(str(tmp_dir))

    ani = calculate_ani(dna_fragment_matches)
    conserved_dna = calculate_conserved_dna(dna_fragments,
                                            dna_fragment_matches)

    return (ref_genome_id, ani, conserved_dna)

コード例 #27

0

ファイルを表示

	def __init__(self, file):
		"""Accept a path (opened here for writing) or an already open file object.

		``_close_on_exit`` records whether this object opened the file itself.
		"""
		owns_file = isinstance(file, str)
		if owns_file:
			file = xopen(file, 'w')
		self._file = file
		self._close_on_exit = owns_file

コード例 #28

0

ファイルを表示

ファイル: test_xopen.py プロジェクト: pycompression/xopen

def test_override_output_format(tmp_path):
    """``format="gz"`` must produce gzip data even without a .gz suffix."""
    target = tmp_path / "test_gzip_compressed"
    with xopen(target, mode="wb", format="gz") as handle:
        handle.write(b"test")
    raw = target.read_bytes()
    assert raw[:2] == b"\x1f\x8b"  # Gzip magic
    assert gzip.decompress(raw) == b"test"

コード例 #29

0

ファイルを表示

ファイル: test_xopen.py プロジェクト: pycompression/xopen

def test_truncated_iter(extension, create_truncated_file):
    """Iterating over a truncated file must raise EOFError or IOError."""
    broken_path = create_truncated_file(extension)
    with pytest.raises((EOFError, IOError)):
        handle = xopen(broken_path, "r")
        for _ in handle:
            pass
        handle.close()  # pragma: no cover

コード例 #30

0

ファイルを表示

	def test_truncated_gz():
		"""Read a truncated .gz file inside a 2-second timeout block."""
		with temporary_path('truncated.gz') as broken_path:
			create_truncated_file(broken_path)
			with timeout(seconds=2):
				handle = xopen(broken_path, 'r')
				handle.read()
				handle.close()

コード例 #31

0

ファイルを表示

ファイル: test_open.py プロジェクト: marcelm/dnaio

def make_random_fasta(path, n_records):
    """Write *n_records* FASTA records of 300 random ACGT bases each to *path*.

    Records are named ``sequence_0``, ``sequence_1``, ...
    """
    from random import choice
    with xopen(path, "w") as out:
        for index in range(n_records):
            header = "sequence_{}".format(index)
            bases = [choice("ACGT") for _ in range(300)]
            print(">", header, "\n", "".join(bases), sep="", file=out)

コード例 #32

0

ファイルを表示

ファイル: pipeline.py プロジェクト: marcelm/cutadapt

def reader_process(file, file2, connections, queue, buffer_size, stdin_fd):
    """
    Read chunks of FASTA or FASTQ data from *file* and send to a worker.

    queue -- a Queue of worker indices. A worker writes its own index into this
        queue to notify the reader that it is ready to receive more data.
    connections -- a list of Connection objects, one for each worker.

    The function repeatedly

    - reads a chunk from the file
    - reads a worker index from the Queue
    - sends the chunk to connections[index]

    and finally sends "poison pills" (the value -1) to all connections.
    If an exception occurs, every connection instead receives -2 followed by
    an (exception, traceback text) tuple.
    """
    if stdin_fd != -1:
        sys.stdin.close()
        sys.stdin = os.fdopen(stdin_fd)
    try:
        with xopen(file, 'rb') as f:
            if not file2:
                single_chunks = dnaio.read_chunks(f, buffer_size)
                for chunk_index, chunk in enumerate(single_chunks):
                    # Determine the worker that should get this chunk
                    target = connections[queue.get()]
                    target.send(chunk_index)
                    target.send_bytes(chunk)
            else:
                with xopen(file2, 'rb') as f2:
                    paired_chunks = dnaio.read_paired_chunks(f, f2, buffer_size)
                    for chunk_index, (chunk1, chunk2) in enumerate(paired_chunks):
                        # Determine the worker that should get this chunk
                        target = connections[queue.get()]
                        target.send(chunk_index)
                        target.send_bytes(chunk1)
                        target.send_bytes(chunk2)

        # Send poison pills to all workers
        for _ in connections:
            ready_worker = queue.get()
            connections[ready_worker].send(-1)
    except Exception as e:
        # TODO better send this to a common "something went wrong" Queue
        for target in connections:
            target.send(-2)
            target.send((e, traceback.format_exc()))

コード例 #33

0

ファイルを表示

	def __init__(self, file):
		"""
		file is a path or a file-like object. In both cases, the file may
		be compressed (.gz, .bz2, .xz).
		"""
		if not isinstance(file, basestring):
			# File-like object: stored as given. _close_on_exit is not
			# assigned on this path - presumably a class-level default
			# exists; TODO confirm.
			self._file = file
			return
		opened = xopen(file)
		self._close_on_exit = True
		self._file = opened

コード例 #34

0

ファイルを表示

	def __init__(self, file, colorspace=False, skip_color=0):
		"""
		file is a filename or a file-like object.
		If file is a filename, then .gz files are supported.

		colorspace -- Usually (when this is False), there must be n characters in the sequence and
		n quality values. When this is True, there must be n+1 characters in the sequence and n quality values.
		skip_color -- stored unchanged as self.skip_color.
		"""
		# A filename is opened for reading; anything else is used as-is.
		self.fp = xopen(file, "r") if isinstance(file, basestring) else file
		self.twoheaders = False
		self.skip_color = skip_color
		self.colorspace = colorspace

コード例 #35

0

ファイルを表示

	def __init__(self, file, wholefile=False, keep_linebreaks=False):
		"""
		file is a filename or a file-like object.
		If file is a filename, then .gz files are supported.
		If wholefile is True, then it is ok to read the entire file
		into memory. This is faster when there are many newlines in
		the file, but may obviously need a lot of memory.
		keep_linebreaks -- whether to keep the newline characters in the sequence

		wholefile and keep_linebreaks must not both be true.
		"""
		# A filename is opened for reading; anything else is used as-is.
		self.fp = xopen(file, "r") if isinstance(file, basestring) else file
		self.wholefile, self.keep_linebreaks = wholefile, keep_linebreaks
		assert not (wholefile and keep_linebreaks), "not supported"

コード例 #36

0

ファイルを表示

ファイル: fasta.py プロジェクト: azerxu/pyngs

def parse(fname):
    """Parse a multi-record FASTA file and yield one ``Fasta`` object per record.

    fname -- path handed to ``xopen``.

    Blank lines are skipped and sequence lines of one record are joined
    without separators. The file is closed when the generator is exhausted,
    closed, or fails.
    """
    name = ''
    seq = []
    handle = xopen(fname, 'r')
    try:
        for line in handle:
            line = line.strip()
            if not line:
                continue
            if line.startswith('>'):
                # A new header finishes the record collected so far.
                if name or seq:
                    yield Fasta(name, ''.join(seq))
                name = line[1:]
                seq = []
            else:
                seq.append(line)

        # Emit the final record, which no following header terminates.
        if name or seq:
            yield Fasta(name, ''.join(seq))
    finally:
        # The original never closed the handle; release it in every case.
        handle.close()

コード例 #37

0

ファイルを表示

ファイル: qseq.py プロジェクト: azerxu/pyngs

def parse(qseqfile, fmt="I"):
    """Parse a qseq file and yield one ``Fastq`` object per non-blank line.

    qseqfile -- path handed to ``xopen``.
    fmt -- quality format (compared upper-cased); when it is contained in
        ``PHRED64_FORMAT`` the quality string is translated with the table
        returned by ``phred64to33()``.

    Each line must consist of exactly 11 tab-separated fields. The file is
    closed when the generator is exhausted, closed, or fails.
    """
    fmt = fmt.upper()
    # NOTE(review): the file is opened in binary mode but lines are split
    # with a str separator - confirm this matches the target interpreter.
    handle = xopen(qseqfile, "rb")
    try:
        table_64_to_33 = phred64to33()
        for line in handle:
            line = line.strip()
            if not line:
                continue

            (mach, runid, lane, tile, x, y, index, readid, seq, qual, fil) = line.split("\t")

            # if fil value is 1 pass filter, 0 not
            fil = "N" if fil == "1" else "Y"

            if fmt in PHRED64_FORMAT:
                # trans phred64 quality to phred33 quality
                qual = qual.translate(table_64_to_33)

            name = "{0}:{1}:{2}:{3}:{4}:{5} {6}:{7}:{8}".format(mach, runid, lane, tile, x, y, readid, fil, index)
            yield Fastq(name, seq, qual)
    finally:
        # The original never closed the handle; release it in every case.
        handle.close()

コード例 #38

0

ファイルを表示

ファイル: fastq.py プロジェクト: azerxu/pyngs

def parse(fname, qtype='S'):
    """Parse a FASTQ file and yield one ``Fastq`` object per record.

    fname -- path handed to ``xopen``.
    qtype -- quality type; when it is contained in ``PHRED64_TYPE`` every
        quality string is passed through ``phred64to33`` before it is yielded.

    Leading blank lines and lines starting with '#' are skipped. Sequence and
    quality blocks may span several lines. An empty file yields nothing.

    Raises ValueError if the first data line does not start with '@', or if
    sequence and quality lengths do not match up.
    """
    seq = ''
    qual = ''
    name = ''
    # NOTE: both lengths accumulate over the whole file; they are compared
    # with each other only, never reset per record.
    slen = qlen = 0

    if qtype in PHRED64_TYPE:
        need_trans = True
        trans = phred64to33
    else:
        need_trans = False

    handle = xopen(fname, 'r')
    try:
        # Stays '' when the file has no lines at all; the original left the
        # name unbound in that case and failed on the emptiness check below.
        line = ''
        # read head lines to check is or not fastq file
        for line in handle:
            line = line.rstrip()
            if not line or line.startswith('#'):
                continue
            if not line.startswith('@'):
                raise ValueError('{0} is not in fastq format'.format(fname))
            break                           # quit cycle

        # check is a empty fastq file or not
        if not line:
            return

        is_seq_block = True                 # True as Seq block, False Qual block
        name = line[1:]

        for line in handle:
            line = line.rstrip()            # trim right endof \n \r
            if not line:                    # ignore blank line
                continue
            if is_seq_block:                # deal with seq block
                if line.startswith('+'):    # next is qual block
                    is_seq_block = False
                else:                       # deal with seq block
                    seq += line
                    slen += len(line)
            else:                           # deal with quality block
                if qlen > slen:             # check qual length <= seq length
                    raise ValueError('Error while Parsing {0}'.format(name))

                if line.startswith('@'):    # switch to sequence block
                    # at beginning of next fastq
                    if seq and slen == qlen:
                        if need_trans:
                            qual = trans(qual)
                        yield Fastq(name, seq, qual)
                        seq = ''
                        qual = ''
                        name = line[1:]
                        is_seq_block = True    # next is seq block
                    elif not seq and not qual: # start to generate fastq
                        name = line[1:]
                        is_seq_block = True    # next is seq block
                    else:                      # just a qual line begin with @
                        qual += line           # renew quality value
                        qlen += len(line)
                else:
                    qual += line
                    qlen += len(line)

        # yield last fastq record
        if name or seq:
            if slen != qlen:                    # check the last fastq record
                raise ValueError('parsing wrong with {0}'.format(name))

            if need_trans:                  # trans qual
                qual = trans(qual)

            yield Fastq(name, seq, qual)
    finally:
        # The original never closed the handle; release it in every case.
        handle.close()

コード例 #39

0

ファイルを表示

def main(cmdlineargs=None, default_outfile=sys.stdout):
	"""
	Main function that evaluates command-line parameters and iterates
	over all reads.

	cmdlineargs is the list of command-line arguments to parse; when it is
	None, ``sys.argv[1:]`` is used.

	default_outfile is the file to which trimmed reads are sent if the ``-o``
	parameter is not used.

	Invalid option combinations are reported through ``parser.error()``.
	While the pipeline runs, the process exits with status 130 on
	KeyboardInterrupt, with status 1 on a broken pipe (EPIPE), and with an
	error message on ``seqio.FormatError``/``EOFError``. On success the
	function closes the files it opened, prints the report unless ``--quiet``
	was given, and returns None.
	"""
	parser = get_option_parser()
	if cmdlineargs is None:
		cmdlineargs = sys.argv[1:]
	options, args = parser.parse_args(args=cmdlineargs)
	# Setup logging only if there are not already any handlers (can happen when
	# this function is being called externally such as from unit tests)
	if not logging.root.handlers:
		setup_logging(stdout=bool(options.output), quiet=options.quiet)

	if len(args) == 0:
		parser.error("At least one parameter needed: name of a FASTA or FASTQ file.")
	elif len(args) > 2:
		parser.error("Too many parameters.")
	input_filename = args[0]
	if input_filename.endswith('.qual'):
		parser.error("If a .qual file is given, it must be the second argument.")

	# Find out which 'mode' we need to use.
	# Default: single-read trimming (neither -p nor -A/-G/-B/-U/--interleaved given)
	# 'paired' is False, 'first' (legacy mode) or 'both'.
	paired = False
	if options.paired_output:
		# Modify first read only, keep second in sync (-p given, but not -A/-G/-B/-U).
		# This exists for backwards compatibility ('legacy mode').
		paired = 'first'
	# Any of these options switch off legacy mode
	if (options.adapters2 or options.front2 or options.anywhere2 or
			options.cut2 or options.interleaved or options.pair_filter or
			options.too_short_paired_output or options.too_long_paired_output):
		# Full paired-end trimming when both -p and -A/-G/-B/-U given
		# Read modifications (such as quality trimming) are applied also to second read.
		paired = 'both'

	if paired and len(args) == 1 and not options.interleaved:
		parser.error("When paired-end trimming is enabled via -A/-G/-B/-U/"
			"--interleaved or -p, two input files are required.")
	if not paired:
		if options.untrimmed_paired_output:
			parser.error("Option --untrimmed-paired-output can only be used when "
				"trimming paired-end reads (with option -p).")

	interleaved_input = False
	interleaved_output = False
	if options.interleaved:
		# A single input file means the input is interleaved; a missing -p
		# means the output is interleaved.
		interleaved_input = len(args) == 1
		interleaved_output = not options.paired_output
		if not interleaved_input and not interleaved_output:
			parser.error("When --interleaved is used, you cannot provide both two input files and two output files")

	# Assign input_paired_filename and quality_filename
	input_paired_filename = None
	quality_filename = None
	if paired:
		if not interleaved_input:
			input_paired_filename = args[1]
		if not interleaved_output:
			if not options.paired_output:
				parser.error("When paired-end trimming is enabled via -A/-G/-B/-U, "
					"a second output file needs to be specified via -p (--paired-output).")
			if not options.output:
				parser.error("When you use -p or --paired-output, you must also "
					"use the -o option.")

		if bool(options.untrimmed_output) != bool(options.untrimmed_paired_output):
			parser.error("When trimming paired-end reads, you must use either none "
				"or both of the --untrimmed-output/--untrimmed-paired-output options.")
		if options.too_short_output and not options.too_short_paired_output:
			parser.error("When using --too-short-output with paired-end "
				"reads, you also need to use --too-short-paired-output")
		if options.too_long_output and not options.too_long_paired_output:
			parser.error("When using --too-long-output with paired-end "
				"reads, you also need to use --too-long-paired-output")
	elif len(args) == 2:
		# Single-end mode with two arguments: the second one is a quality file.
		quality_filename = args[1]
		if options.format is not None:
			parser.error("If a pair of .fasta and .qual files is given, the -f/--format parameter cannot be used.")

	if options.format is not None and options.format.lower() not in ['fasta', 'fastq', 'sra-fastq']:
		parser.error("The input file format must be either 'fasta', 'fastq' or "
			"'sra-fastq' (not '{0}').".format(options.format))

	# Open input file(s)
	try:
		reader = seqio.open(input_filename, file2=input_paired_filename,
				qualfile=quality_filename, colorspace=options.colorspace,
				fileformat=options.format, interleaved=interleaved_input)
	except (seqio.UnknownFileType, IOError) as e:
		parser.error(e)

	# Parse -q: either "CUTOFF" (stored as [0, CUTOFF]) or "CUTOFF1,CUTOFF2".
	if options.quality_cutoff is not None:
		cutoffs = options.quality_cutoff.split(',')
		if len(cutoffs) == 1:
			try:
				cutoffs = [0, int(cutoffs[0])]
			except ValueError as e:
				parser.error("Quality cutoff value not recognized: {0}".format(e))
		elif len(cutoffs) == 2:
			try:
				cutoffs = [int(cutoffs[0]), int(cutoffs[1])]
			except ValueError as e:
				parser.error("Quality cutoff value not recognized: {0}".format(e))
		else:
			parser.error("Expected one value or two values separated by comma for the quality cutoff")
	else:
		cutoffs = None

	# All output writers share the same quality/colorspace settings.
	open_writer = functools.partial(seqio.open, mode='w',
		qualities=reader.delivers_qualities, colorspace=options.colorspace)

	if options.pair_filter is None:
		options.pair_filter = 'any'
	min_affected = 2 if options.pair_filter == 'both' else 1
	# Pick the wrapper class used for every filter below, depending on the mode.
	if not paired:
		filter_wrapper = Redirector
	elif paired == 'first':
		filter_wrapper = LegacyPairedRedirector
	elif paired == 'both':
		filter_wrapper = functools.partial(PairedRedirector, min_affected=min_affected)
	filters = []
	# TODO open_files = []
	too_short_writer = None  # too short reads go here
	# TODO pass file name to TooShortReadFilter, add a .close() method?
	if options.minimum_length > 0:
		if options.too_short_output:
			too_short_writer = open_writer(options.too_short_output, options.too_short_paired_output)
		filters.append(filter_wrapper(too_short_writer, TooShortReadFilter(options.minimum_length)))
	too_long_writer = None  # too long reads go here
	if options.maximum_length < sys.maxsize:
		if options.too_long_output is not None:
			too_long_writer = open_writer(options.too_long_output, options.too_long_paired_output)
		filters.append(filter_wrapper(too_long_writer, TooLongReadFilter(options.maximum_length)))

	if options.max_n != -1:
		filters.append(filter_wrapper(None, NContentFilter(options.max_n)))

	if int(options.discard_trimmed) + int(options.discard_untrimmed) + int(options.untrimmed_output is not None) > 1:
		parser.error("Only one of the --discard-trimmed, --discard-untrimmed "
			"and --untrimmed-output options can be used at the same time.")
	demultiplexer = None
	untrimmed_writer = None
	writer = None
	# A '{name}' placeholder in the output file name switches on demultiplexing.
	if options.output is not None and '{name}' in options.output:
		if options.discard_trimmed:
			parser.error("Do not use --discard-trimmed when demultiplexing.")
		if paired:
			parser.error("Demultiplexing not supported for paired-end files, yet.")
		untrimmed = options.output.replace('{name}', 'unknown')
		if options.untrimmed_output:
			untrimmed = options.untrimmed_output
		if options.discard_untrimmed:
			untrimmed = None
		demultiplexer = Demultiplexer(options.output, untrimmed,
			qualities=reader.delivers_qualities, colorspace=options.colorspace)
		filters.append(demultiplexer)
	else:
		# Set up the remaining filters to deal with --discard-trimmed,
		# --discard-untrimmed and --untrimmed-output. These options
		# are mutually exclusive in order to avoid brain damage.
		if options.discard_trimmed:
			filters.append(filter_wrapper(None, DiscardTrimmedFilter()))
		elif options.discard_untrimmed:
			filters.append(filter_wrapper(None, DiscardUntrimmedFilter()))
		elif options.untrimmed_output:
			untrimmed_writer = open_writer(options.untrimmed_output,
				options.untrimmed_paired_output)
			filters.append(filter_wrapper(untrimmed_writer, DiscardUntrimmedFilter()))

		# Finally, figure out where the reads that passed all the previous
		# filters should go.
		if options.output is not None:
			writer = open_writer(options.output, options.paired_output, interleaved=interleaved_output)
		else:
			writer = open_writer(default_outfile, interleaved=interleaved_output)
		if not paired:
			filters.append(NoFilter(writer))
		else:
			filters.append(PairedNoFilter(writer))

	# --maq is a shortcut that switches on several colorspace-related options.
	if options.maq:
		options.colorspace = True
		options.double_encode = True
		options.trim_primer = True
		options.strip_suffix.append('_F3')
		options.suffix = "/1"
	if options.zero_cap is None:
		options.zero_cap = options.colorspace
	if options.trim_primer and not options.colorspace:
		parser.error("Trimming the primer makes only sense in colorspace.")
	if options.double_encode and not options.colorspace:
		parser.error("Double-encoding makes only sense in colorspace.")
	if options.anywhere and options.colorspace:
		parser.error("Using --anywhere with colorspace reads is currently not supported (if you think this may be useful, contact the author).")
	if not (0 <= options.error_rate <= 1.):
		parser.error("The maximum error rate must be between 0 and 1.")
	if options.overlap < 1:
		parser.error("The overlap must be at least 1.")

	# The file-name options are replaced in place by the opened file objects.
	if options.rest_file is not None:
		options.rest_file = xopen(options.rest_file, 'w')
		rest_writer = RestFileWriter(options.rest_file)
	else:
		rest_writer = None
	if options.info_file is not None:
		options.info_file = xopen(options.info_file, 'w')
	if options.wildcard_file is not None:
		options.wildcard_file = xopen(options.wildcard_file, 'w')

	if options.colorspace:
		if options.match_read_wildcards:
			parser.error('IUPAC wildcards not supported in colorspace')
		options.match_adapter_wildcards = False

	adapter_parser = AdapterParser(
		colorspace=options.colorspace,
		max_error_rate=options.error_rate,
		min_overlap=options.overlap,
		read_wildcards=options.match_read_wildcards,
		adapter_wildcards=options.match_adapter_wildcards,
		indels=options.indels)

	try:
		adapters = adapter_parser.parse_multi(options.adapters, options.anywhere, options.front)
		adapters2 = adapter_parser.parse_multi(options.adapters2, options.anywhere2, options.front2)
	except IOError as e:
		# Only a missing file is reported as a usage error; anything else propagates.
		if e.errno == errno.ENOENT:
			parser.error(e)
		raise
	except ValueError as e:
		parser.error(e)
	if options.debug:
		for adapter in adapters + adapters2:
			adapter.enable_debug()

	# Create the single-end processing pipeline (a list of "modifiers")
	modifiers = []
	if options.cut:
		if len(options.cut) > 2:
			parser.error("You cannot remove bases from more than two ends.")
		# Two values with the same sign would cut the same end twice.
		if len(options.cut) == 2 and options.cut[0] * options.cut[1] > 0:
			parser.error("You cannot remove bases from the same end twice.")
		for cut in options.cut:
			if cut != 0:
				modifiers.append(UnconditionalCutter(cut))

	if options.nextseq_trim is not None:
		modifiers.append(NextseqQualityTrimmer(options.nextseq_trim, options.quality_base))

	if cutoffs:
		modifiers.append(QualityTrimmer(cutoffs[0], cutoffs[1], options.quality_base))
	if adapters:
		adapter_cutter = AdapterCutter(adapters, options.times,
				options.wildcard_file, options.info_file,
				rest_writer, options.action)
		modifiers.append(adapter_cutter)

	# Modifiers that apply to both reads of paired-end reads unless in legacy mode
	modifiers_both = []
	if options.length is not None:
		modifiers_both.append(Shortener(options.length))
	if options.trim_n:
		modifiers_both.append(NEndTrimmer())
	if options.length_tag:
		modifiers_both.append(LengthTagModifier(options.length_tag))
	if options.strip_f3:
		options.strip_suffix.append('_F3')
	for suffix in options.strip_suffix:
		modifiers_both.append(SuffixRemover(suffix))
	if options.prefix or options.suffix:
		modifiers_both.append(PrefixSuffixAdder(options.prefix, options.suffix))
	if options.double_encode:
		modifiers_both.append(DoubleEncoder())
	if options.zero_cap and reader.delivers_qualities:
		modifiers_both.append(ZeroCapper(quality_base=options.quality_base))
	if options.trim_primer:
		# NOTE(review): appended without being called, unlike the other
		# modifiers -- presumably PrimerTrimmer is itself a callable; confirm.
		modifiers_both.append(PrimerTrimmer)
	modifiers.extend(modifiers_both)

	# For paired-end data, create a second processing pipeline.
	# However, if no second-read adapters were given (via -A/-G/-B/-U), we need to
	# be backwards compatible and *no modifications* are done to the second read.
	modifiers2 = []
	if paired == 'both':
		if options.cut2:
			if len(options.cut2) > 2:
				parser.error("You cannot remove bases from more than two ends.")
			if len(options.cut2) == 2 and options.cut2[0] * options.cut2[1] > 0:
				parser.error("You cannot remove bases from the same end twice.")
			for cut in options.cut2:
				if cut != 0:
					modifiers2.append(UnconditionalCutter(cut))

		if cutoffs:
			modifiers2.append(QualityTrimmer(cutoffs[0], cutoffs[1], options.quality_base))
		if adapters2:
			adapter_cutter2 = AdapterCutter(adapters2, options.times,
					None, None, None, options.action)
			modifiers2.append(adapter_cutter2)
		else:
			adapter_cutter2 = None
		modifiers2.extend(modifiers_both)

	if paired:
		pipeline = PairedEndPipeline(reader, modifiers, modifiers2, filters)
	else:
		pipeline = SingleEndPipeline(reader, modifiers, filters)

	logger.info("This is cutadapt %s with Python %s", __version__, platform.python_version())
	logger.info("Command line parameters: %s", " ".join(cmdlineargs))
	logger.info("Trimming %s adapter%s with at most %.1f%% errors in %s mode ...",
		len(adapters) + len(adapters2), 's' if len(adapters) + len(adapters2) != 1 else '',
		options.error_rate * 100,
		{ False: 'single-end', 'first': 'paired-end legacy', 'both': 'paired-end' }[paired])

	if paired == 'first' and (modifiers_both or cutoffs):
		logger.warning('\n'.join(textwrap.wrap('WARNING: Requested read '
			'modifications are applied only to the first '
			'read since backwards compatibility mode is enabled. '
			'To modify both reads, also use any of the -A/-B/-G/-U options. '
			'Use a dummy adapter sequence when necessary: -A XXX')))

	# time.clock() was removed in Python 3.8; time.time() is available on
	# every Python version and measures elapsed wall-clock time.
	start_time = time.time()
	try:
		stats = pipeline.run()
	except KeyboardInterrupt:
		print("Interrupted", file=sys.stderr)
		sys.exit(130)
	except IOError as e:
		# A closed output pipe ends the run quietly with exit status 1.
		if e.errno == errno.EPIPE:
			sys.exit(1)
		raise
	except (seqio.FormatError, EOFError) as e:
		sys.exit("cutadapt: error: {0}".format(e))

	# close open files (each one listed exactly once so nothing is closed twice)
	for f in [writer, untrimmed_writer,
			options.rest_file, options.wildcard_file,
			options.info_file, too_short_writer, too_long_writer,
			demultiplexer]:
		if f is not None and f is not sys.stdin and f is not sys.stdout:
			f.close()

	elapsed_time = time.time() - start_time
	if not options.quiet:
		stats.collect((adapters, adapters2), elapsed_time,
			modifiers, modifiers2, filters)
		# send statistics to stderr if result was sent to stdout
		stat_file = sys.stderr if options.output is None else None
		with redirect_standard_output(stat_file):
			print_report(stats, (adapters, adapters2))

コード例 #40

0

ファイルを表示

ファイル: insta-scrap.py プロジェクト: vishalyadao786/My_Git_Repository

from xopen import xopen
import glob
import json
import pandas as pd
import numpy

files = glob.glob("colorsinart_/2018*xz") #load all files in directory to be parsed through the xopen read
list_item = []
list_header = ['description', 'date_posted', 'likes', 'comments', 'post_id', 'username', 'is_connected_fb', 'is_video']

# Read every matched file and wrap the pieces in a single JSON array.
# Joining the pieces (instead of special-casing the first and last file)
# keeps the brackets balanced when there is exactly one file, or none at all,
# and avoids repeated string concatenation.
chunks = []
for file in files:
    with xopen(file) as f:
        chunks.append(str(f.read()))
content = "[" + ", \n".join(chunks) + "]"

#convert string to listed dict format
data = json.loads(content)

#extract the required metric from the json data
for data in data:
    list_item += [(data['node']['edge_media_to_caption']['edges'][0]['node']['text'],
                datetime.datetime.fromtimestamp(data['node']['taken_at_timestamp']).strftime('%Y-%m-%d %H:%M:%S'), #transform the date format from ms

コード例 #41

0

ファイルを表示

def main(cmdlineargs=None):
	"""
	Main function that evaluates command-line parameters and contains the main loop over all reads.

	cmdlineargs is the argument list handed to the option parser's
	``parse_args()``; it defaults to None.

	Returns 0 on success. Returns 1 when neither an adapter nor a quality
	cutoff was given, when writing a read fails with a broken pipe (EPIPE),
	or when the input raises ``seqio.FormatError``. Invalid option
	combinations are reported through ``parser.error()``.
	"""
	parser = HelpfulOptionParser(usage=__doc__, version=__version__)

	parser.add_option("-f", "--format", default=None,
		help="Input file format; can be either 'fasta', 'fastq' or 'sra-fastq'. "
		"Ignored when reading csfasta/qual files (default: auto-detect from file name extension).")

	group = OptionGroup(parser, "Options that influence how the adapters are found",
		description="Each of the following three parameters (-a, -b, -g) can be used " +\
			"multiple times and in any combination to search for an entire set of " + \
			"adapters of possibly different types. All of the "+\
			"given adapters will be searched for in each read, but only the best "+\
			"matching one will be trimmed (but see the --times option).")
	group.add_option("-a", "--adapter", action="append", metavar="ADAPTER", dest="adapters", default=[],
		help="Sequence of an adapter that was ligated to the 3' end. The adapter itself and anything that follows is trimmed.")
	group.add_option("-b", "--anywhere", action="append", metavar="ADAPTER", default=[],
		help="Sequence of an adapter that was ligated to the 5' or 3' end. If the adapter is found within the read or overlapping the 3' end of the read, the behavior is the same as for the -a option. If the adapter overlaps the 5' end (beginning of the read), the initial portion of the read matching the adapter is trimmed, but anything that follows is kept.")
	group.add_option("-g", "--front", action="append", metavar="ADAPTER", default=[],
		help="Sequence of an adapter that was ligated to the 5' end. If the " + \
		"adapter sequence starts with the character '^', the adapter is " + \
		"'anchored'. An anchored adapter must appear in its entirety at the " + \
		"5' end of the read (it is a prefix of the read). A non-anchored adapter may " + \
		"appear partially at the 5' end, or it may occur within the read. If it is " + \
		"found within a read, the sequence preceding the adapter is also trimmed. " + \
		"In all cases the adapter itself is trimmed.")
	group.add_option("-e", "--error-rate", type=float, default=0.1,
		help="Maximum allowed error rate (no. of errors divided by the length of the matching region) (default: %default)")
	group.add_option("-n", "--times", type=int, metavar="COUNT", default=1,
		help="Try to remove adapters at most COUNT times. Useful when an adapter gets appended multiple times (default: %default).")
	group.add_option("-O", "--overlap", type=int, metavar="LENGTH", default=3,
		help="Minimum overlap length. If the overlap between the read and the adapter is shorter than LENGTH, the read is not modified."
			"This reduces the no. of bases trimmed purely due to short random adapter matches (default: %default).")
	group.add_option("--match-read-wildcards", action="store_true", default=False,
		help="Allow 'N's in the read as matches to the adapter (default: %default).")
	group.add_option("-N", "--no-match-adapter-wildcards", action="store_false",
		default=True, dest='match_adapter_wildcards',
		help="Do not treat 'N' in the adapter sequence as wildcards. This is needed when you want to search for literal 'N' characters.")
	parser.add_option_group(group)

	group = OptionGroup(parser, "Options for filtering of processed reads")
	group.add_option("--discard-trimmed", "--discard", action='store_true', default=False,
		help="Discard reads that contain the adapter instead of trimming them. Also use -O in order to avoid throwing away too many randomly matching reads!")
	group.add_option("-m", "--minimum-length", type=int, default=0, metavar="LENGTH",
		help="Discard trimmed reads that are shorter than LENGTH. Reads that are too short even before adapter removal are also discarded. In colorspace, an initial primer is not counted (default: 0).")
	group.add_option("-M", "--maximum-length", type=int, default=sys.maxsize, metavar="LENGTH",
		help="Discard trimmed reads that are longer than LENGTH. "
			"Reads that are too long even before adapter removal "
			"are also discarded. In colorspace, an initial primer "
			"is not counted (default: no limit).")
	parser.add_option_group(group)

	group = OptionGroup(parser, "Options that influence what gets output to where")
	group.add_option("-o", "--output", default=None, metavar="FILE",
		help="Write the modified sequences to this file instead of standard output and send the summary report to standard output. "
		     "The format is FASTQ if qualities are available, FASTA otherwise. (default: standard output)")
	group.add_option("-r", "--rest-file", default=None, metavar="FILE",
		help="When the adapter matches in the middle of a read, write the rest (after the adapter) into a file. Use - for standard output.")
	group.add_option("--wildcard-file", default=None, metavar="FILE",
		help="When the adapter has wildcard bases ('N's) write adapter bases matching wildcard "
		     "positions to FILE.  Use - for standard output.")
	group.add_option("--too-short-output", default=None, metavar="FILE",
		help="Write reads that are too short (according to length specified by -m) to FILE. (default: discard reads)")
	group.add_option("--untrimmed-output", default=None, metavar="FILE",
		help="Write reads that do not contain the adapter to FILE, instead "
			"of writing them to the regular output file. (default: output "
			"to same file as trimmed)")
	parser.add_option_group(group)

	group = OptionGroup(parser, "Additional modifications to the reads")
	group.add_option("-q", "--quality-cutoff", type=int, default=None, metavar="CUTOFF",
		help="Trim low-quality ends from reads before adapter removal. "
			"The algorithm is the same as the one used by BWA "
			"(Subtract CUTOFF from all qualities; "
			"compute partial sums from all indices to the end of the "
			"sequence; cut sequence at the index at which the sum "
			"is minimal) (default: %default)")
	group.add_option("--quality-base", type=int, default=33,
		help="Assume that quality values are encoded as ascii(quality + QUALITY_BASE). The default (33) is usually correct, "
		     "except for reads produced by some versions of the Illumina pipeline, where this should be set to 64. (default: %default)")
	group.add_option("-x", "--prefix", default='',
		help="Add this prefix to read names")
	group.add_option("-y", "--suffix", default='',
		help="Add this suffix to read names")
	group.add_option("-c", "--colorspace", action='store_true', default=False,
		help="Colorspace mode: Also trim the color that is adjacent to the found adapter.")
	group.add_option("-d", "--double-encode", action='store_true', default=False,
		help="When in color space, double-encode colors (map 0,1,2,3,4 to A,C,G,T,N).")
	group.add_option("-t", "--trim-primer", action='store_true', default=False,
		help="When in color space, trim primer base and the first color "
			"(which is the transition to the first nucleotide)")
	group.add_option("--strip-f3", action='store_true', default=False,
		help="For color space: Strip the _F3 suffix of read names")
	group.add_option("--maq", "--bwa", action='store_true', default=False,
		help="MAQ- and BWA-compatible color space output. This enables -c, -d, -t, --strip-f3, -y '/1' and -z.")
	group.add_option("--length-tag", default=None, metavar="TAG",
		help="Search for TAG followed by a decimal number in the name of the read "
			"(description/comment field of the FASTA or FASTQ file). Replace the "
			"decimal number with the correct length of the trimmed read. "
			"For example, use --length-tag 'length=' to search for fields "
			"like 'length=123'.")
	group.add_option("--zero-cap", "-z", action='store_true', default=False,
		help="Change negative quality values to zero (workaround to avoid segmentation faults in BWA)")
	parser.add_option_group(group)

	options, args = parser.parse_args(args=cmdlineargs)

	if len(args) == 0:
		parser.error("At least one parameter needed: name of a FASTA or FASTQ file.")
	elif len(args) > 2:
		parser.error("Too many parameters.")

	input_filename = args[0]
	quality_filename = None
	if len(args) == 2:
		quality_filename = args[1]
	# quality_filename stays None when only one argument was given, so it must
	# be checked before calling .endswith() on it.
	if (quality_filename is not None and input_filename.endswith('.qual')
			and quality_filename.endswith('fasta')):
		parser.error("FASTA and QUAL file given, but the FASTA file must be first.")

	if options.format is not None and options.format.lower() not in ['fasta', 'fastq', 'sra-fastq']:
		parser.error("The input file format must be either 'fasta', 'fastq' or 'sra-fastq' (not '{0}').".format(options.format))

	# TODO should this really be an error?
	if options.format is not None and quality_filename is not None:
		parser.error("If a pair of .fasta and .qual files is given, the -f/--format parameter cannot be used.")

	# default output files (overwritten below)
	trimmed_outfile = sys.stdout # reads with adapters go here
	too_short_outfile = None # too short reads go here
	#too_long_outfile = None # too long reads go here

	if options.output is not None:
		trimmed_outfile = xopen(options.output, 'w')
	untrimmed_outfile = trimmed_outfile # reads without adapters go here
	if options.untrimmed_output is not None:
		untrimmed_outfile = xopen(options.untrimmed_output, 'w')
	if options.too_short_output is not None:
		too_short_outfile = xopen(options.too_short_output, 'w')
	#if options.too_long_output is not None:
		#too_long_outfile = xopen(options.too_long_output, 'w')

	# --maq is a shortcut that switches on several colorspace-related options.
	if options.maq:
		options.colorspace = True
		options.double_encode = True
		options.trim_primer = True
		options.strip_f3 = True
		options.suffix = "/1"
		options.zero_cap = True
	if options.trim_primer and not options.colorspace:
		parser.error("Trimming the primer makes only sense in color space.")
	if options.double_encode and not options.colorspace:
		parser.error("Double-encoding makes only sense in color space.")
	if options.colorspace and options.front and not options.trim_primer:
		parser.error("Currently, when you want to trim a 5' adapter in colorspace, you must also specify the --trim-primer option")
	if options.anywhere and options.colorspace:
		parser.error("Using --anywhere with color space reads is currently not supported  (if you think this may be useful, contact the author).")
	if not (0 <= options.error_rate <= 1.):
		parser.error("The maximum error rate must be between 0 and 1.")
	if options.overlap < 1:
		parser.error("The overlap must be at least 1.")

	# The file-name options are replaced in place by the opened file objects.
	if options.rest_file is not None:
		options.rest_file = xopen(options.rest_file, 'w')
	if options.wildcard_file is not None:
		options.wildcard_file = xopen(options.wildcard_file, 'w')

	adapters = []

	def append_adapters(adapter_list, where):
		"""Create an Adapter for each sequence in adapter_list and add it to adapters."""
		for seq in adapter_list:
			seq = seq.strip()
			w = where
			# A leading '^' on a FRONT adapter marks it as anchored (PREFIX).
			if w == FRONT and seq.startswith('^'):
				seq = seq[1:]
				w = PREFIX
			adapters.append(Adapter(seq, w, options.error_rate,
				options.overlap, options.match_read_wildcards,
				options.colorspace,
				options.match_adapter_wildcards,
				options.wildcard_file,
				options.rest_file))

	append_adapters(options.adapters, BACK)
	append_adapters(options.anywhere, ANYWHERE)
	append_adapters(options.front, FRONT)


	# make sure these aren't used by accident
	del options.adapters
	del options.anywhere
	del options.front

	if not adapters and options.quality_cutoff is None:
		print("You need to provide at least one adapter sequence.", file=sys.stderr)
		return 1

	#total_bases = 0
	#total_quality_trimmed = 0

	# Modifiers are applied to every read after adapter cutting, in this order.
	modifiers = []
	if options.length_tag:
		modifiers.append(LengthTagModifier(options.length_tag))
	if options.strip_f3:
		modifiers.append(SuffixRemover('_F3'))
	if options.prefix or options.suffix:
		modifiers.append(PrefixSuffixAdder(options.prefix, options.suffix))
	if options.double_encode:
		modifiers.append(DoubleEncoder())
	if options.zero_cap:
		modifiers.append(ZeroCapper(quality_base=options.quality_base))

	cutter = AdapterCutter(adapters, options.times, options.rest_file,
				options.colorspace, options.wildcard_file)
	readfilter = ReadFilter(options.minimum_length, options.maximum_length,
		too_short_outfile, options.discard_trimmed, cutter.stats) # TODO stats?
	try:
		# Determined lazily from the reader once the first read has been processed.
		twoheaders = None
		reader = read_sequences(input_filename, quality_filename, colorspace=options.colorspace, fileformat=options.format)
		for read in reader:
			# In colorspace, the first character is the last nucleotide of the primer base
			# and the second character encodes the transition from the primer base to the
			# first real base of the read.
			if options.trim_primer:
				read.sequence = read.sequence[2:]
				if read.qualities is not None: # TODO
					read.qualities = read.qualities[1:]
				initial = ''
			elif options.colorspace:
				# Set the primer base aside; it is put back before writing.
				initial = read.sequence[0]
				read.sequence = read.sequence[1:]
			else:
				initial = ''

			#total_bases += len(qualities)
			if options.quality_cutoff is not None:
				index = quality_trim_index(read.qualities, options.quality_cutoff, options.quality_base)
				read = read[:index]

			read, trimmed = cutter.cut(read)
			for modifier in modifiers:
				read = modifier.apply(read)
			if twoheaders is None:
				try:
					twoheaders = reader.twoheaders
				except AttributeError:
					twoheaders = False
			if readfilter.keep(read, trimmed):
				read.sequence = initial + read.sequence
				try:
					write_read(read, trimmed_outfile if trimmed else untrimmed_outfile, twoheaders)
				except IOError as e:
					if e.errno == errno.EPIPE:
						return 1
					raise
	except seqio.FormatError as e:
		print("Error:", e, file=sys.stderr)
		return 1
	if options.rest_file is not None:
		options.rest_file.close()
	if options.wildcard_file is not None:
		options.wildcard_file.close()
	# Close the read output files opened above so buffered data is written
	# out. sys.stdout is left open, and the untrimmed file is skipped when it
	# is the same object as the trimmed file so nothing is closed twice.
	outfiles = [trimmed_outfile, too_short_outfile]
	if untrimmed_outfile is not trimmed_outfile:
		outfiles.append(untrimmed_outfile)
	for outfile in outfiles:
		if outfile is not None and outfile is not sys.stdout:
			outfile.close()
	# send statistics to stderr if result was sent to stdout
	stat_file = sys.stderr if options.output is None else None
	cutter.stats.print_statistics(options.error_rate, file=stat_file)

	return 0