def testNbSeq(self):
    """Check that FastaIO.nbSeq() counts 4 sequences in every fixture file."""
    fixtures = (
        self.tmp_mono_line,
        self.tmp_multi_line,
        self.tmp_multi_line_gz,
    )
    for fasta_path in fixtures:
        self.assertEqual(4, FastaIO.nbSeq(fasta_path))
def writeTargetReads(out_R1_path, out_R2_path, reads_pairs):
    """
    Append each pair of reads to the R1 and R2 files.

    :param out_R1_path: Path to the file receiving the first element of each pair (opened with FastaIO in "a" mode).
    :type out_R1_path: str
    :param out_R2_path: Path to the file receiving the second element of each pair (opened with FastaIO in "a" mode).
    :type out_R2_path: str
    :param reads_pairs: Iterable of (R1, R2) couples; each element is passed as is to FastaIO.write().
    """
    with FastaIO(out_R1_path, "a") as writer_R1, FastaIO(out_R2_path, "a") as writer_R2:
        for read_1, read_2 in reads_pairs:
            writer_R1.write(read_1)
            writer_R2.write(read_2)
def addStartTag(in_path, out_path):
    """
    Copy the sequences file and prefix every sequence string with "^".

    :param in_path: Path to the sequences file to read with FastaIO.
    :type in_path: str
    :param out_path: Path to the file written with FastaIO (mode "w").
    :type out_path: str
    """
    # Context managers close both handles even if reading or writing raises
    with FastaIO(in_path) as FH_in, FastaIO(out_path, "w") as FH_out:
        for record in FH_in:
            record.string = "^" + record.string
            FH_out.write(record)
def testNbSeqAndNt(self):
    """Check that FastaIO.nbSeqAndNt() returns 4 sequences and 104 nt for every fixture file."""
    fixtures = (
        self.tmp_mono_line,
        self.tmp_multi_line,
        self.tmp_multi_line_gz,
    )
    for fasta_path in fixtures:
        counts = FastaIO.nbSeqAndNt(fasta_path)
        observed_seq, observed_nt = counts
        self.assertEqual(observed_seq, 4)
        self.assertEqual(observed_nt, 104)
def testIsValid(self):
    """Check FastaIO.isValid() on the fixture files and on hand-written valid/invalid contents."""

    def isValidContent(content):
        # Overwrite the scratch file with content and return the validator verdict
        with open(self.tmp_out, "w") as FH_out:
            FH_out.write(content)
        return FastaIO.isValid(self.tmp_out)

    # Valid
    self.assertTrue(FastaIO.isValid(self.tmp_mono_line))
    self.assertTrue(FastaIO.isValid(self.tmp_multi_line))
    self.assertTrue(FastaIO.isValid(self.tmp_multi_line_gz))
    # Valid long file
    self.assertTrue(isValidContent(
        ">seq1\nATGC\n>seq2\nATGC\n>seq3\nATGC\n>seq4\nATGC\n>seq5\nATGC\n>seq6\nATGC\n"
        ">seq7\nATGC\n>seq8\nATGC\n>seq9\nATGC\n>seq10\nATGC\n>seq11\nATGC\n>seq12\nATGC"
    ))
    # Valid empty file
    self.assertTrue(isValidContent(""))
    # Valid empty sequence
    self.assertTrue(isValidContent(">seq1\n"))
    # Invalid file (two headers)
    self.assertFalse(isValidContent(">seq1\nATGC\n>seq2\n>seq3\nATGC"))
    # Invalid file (no header at the first line)
    self.assertFalse(isValidContent("seq1\nATGC\n>seq2\nATGC"))
    # Invalid file (fastq)
    self.assertFalse(isValidContent("@seq1\nATGC\n+\n####"))
def getSeqByChr(genome_path):
    """
    Return by chromosome name the sequence of this chromosome.

    :param genome_path: Path to the genome file (format: fasta).
    :type genome_path: str
    :return: By chromosome name the sequence of this chromosome in uppercase.
    :rtype: dict
    """
    # The context manager closes the file even if a record cannot be parsed
    with FastaIO(genome_path) as FH_seq:
        return {record.id: record.string.upper() for record in FH_seq}
def getChrSeq(in_ref, chrom_id):
    """
    Return the sequence string of the record whose ID is chrom_id.

    :param in_ref: Path to the sequences file read with FastaIO.
    :type in_ref: str
    :param chrom_id: ID of the searched record.
    :type chrom_id: str
    :return: The string of the matching record, or None if no record matches. The whole file is read: if several records share the ID, the last one is returned.
    """
    found_seq = None
    with FastaIO(in_ref) as reader:
        for curr_record in reader:
            if curr_record.id != chrom_id:
                continue
            found_seq = curr_record.string
    return found_seq
def testIter(self):
    """Check that iterating FastaIO yields the 4 expected records for every fixture, alone and in lockstep."""
    fixtures = (
        self.tmp_mono_line,
        self.tmp_multi_line,
        self.tmp_multi_line_gz,
    )
    for fasta_path in fixtures:
        # Reset per file: a reader yielding nothing must fail the count check
        # instead of reusing the counter left by the previous file
        nb_records = 0
        with FastaIO(fasta_path) as FH_in:
            for nb_records, record in enumerate(FH_in, start=1):
                self.assertTrue(cmpSequences(record, self.expected_rec[nb_records - 1]))
        self.assertEqual(nb_records, 4)
    # Two readers consumed in lockstep
    nb_records = 0
    with FastaIO(self.tmp_mono_line) as FH_in_mono, FastaIO(self.tmp_multi_line) as FH_in_multi:
        for rec_expected, rec_mono_line, rec_multi_line in zip(self.expected_rec, FH_in_mono, FH_in_multi):
            self.assertTrue(cmpSequences(rec_mono_line, rec_expected))
            self.assertTrue(cmpSequences(rec_multi_line, rec_expected))
            nb_records += 1
    self.assertEqual(nb_records, 4)
def getSeqRecord(in_seq, selected_id):
    """
    @summary: Returns the record with the selected ID from the sequences file.
    @param in_seq: [str] Path to the sequences file (format: fasta).
    @param selected_id: [str] The ID of the selected sequence.
    @return: [Sequence] The selected record, or None if the ID is not found. The whole file is read: if several records share the ID, the last one is returned.
    """
    match = None
    with FastaIO(in_seq) as reader:
        for curr_record in reader:
            if curr_record.id != selected_id:
                continue
            match = curr_record
    return match
def getChromSeq(chrom_name, in_fasta):
    """
    Return the sequence corresponding to the chromosome.

    :param chrom_name: The name of the selected chromosome.
    :type chrom_name: str
    :param in_fasta: The path to the sequences file (format: fasta).
    :type in_fasta: str
    :return: The sequence corresponding to the chromosome.
    :rtype: str
    :raises Exception: If no record of the file has chrom_name as ID.
    """
    seq = None
    with FastaIO(in_fasta) as FH_ref:
        for record in FH_ref:
            if record.id == chrom_name:
                # No early exit: as before, the last matching record wins
                seq = record.string
    if seq is None:
        raise Exception(
            'The chromosome "{}" cannot be retrieved from "{}".'.format(chrom_name, in_fasta)
        )
    return seq
def getBEDRecords(ref_path, amplicons):
    """
    Locate each amplicon on the reference from its primers and return the hits as BED records.

    Each amplicon is searched on every sequence of the reference, on strand "+"
    (forward primer upstream, reverse primer downstream, as provided) then on
    strand "-" (reverse-complemented primers, swapped). A warning is emitted
    for each amplicon never found.

    :param ref_path: Path to the reference sequences file read with FastaIO.
    :type ref_path: str
    :param amplicons: Amplicons to search. Each one is a dict with the keys "name", "f_primer" and "r_primer". The key "found" is set by this function (side effect on the input). The collection is iterated twice.
    :type amplicons: list
    :return: One BEDRecord per hit, in reference order then amplicon order, strand "+" before "-". The last two BEDRecord arguments are the hit start/end with the primers trimmed off.
    :rtype: list
    """
    # Primers do not depend on the chromosome: prepare both orientations once
    # instead of upper-casing and reverse-complementing them for every record
    searches = []
    for ampl in amplicons:
        ampl["found"] = False
        fwd_primer = ampl["f_primer"].upper()
        rvs_primer = ampl["r_primer"].upper()
        searches.append((
            ampl,
            (
                ("+", fwd_primer, rvs_primer),
                ("-", revcom(rvs_primer), revcom(fwd_primer)),
            ),
        ))
    # Scan the reference
    bed_ampl = []
    with FastaIO(ref_path) as FH_seq:
        for record in FH_seq:
            chr_id = record.id
            chr_str = record.string.upper()
            for ampl, oriented_primers in searches:
                for strand, up_primer, down_primer in oriented_primers:
                    start, end = findPosOnSequence(chr_id, chr_str, up_primer, down_primer)
                    if start is not None:
                        ampl["found"] = True
                        bed_ampl.append(
                            BEDRecord(
                                chr_id, start, end, ampl["name"], 0, strand,
                                start + len(up_primer), end - len(down_primer)
                            )
                        )
    # Report the amplicons without any hit
    for ampl in amplicons:
        if not ampl["found"]:
            warnings.warn(
                'The amplicons {} with primers fwd:{}, rvs:{} cannot be found in {}.'
                .format(ampl["name"], ampl["f_primer"], ampl["r_primer"], ref_path)
            )
    return bed_ampl
def setUp(self):
    """
    Create the temporary inputs and the command line used to test filterVCFPrimers.py.

    Writes a two-sequence fasta and a VCF whose records carry, in the INFO
    field "ZOI", the expectation attached to each variant ("yes" or "no").
    The written records are kept in self.variants.

    NOTE(review): self.tmp_regions is passed to the command but is not written
    in the part of the method visible here - confirm it is created elsewhere.
    """
    tmp_folder = tempfile.gettempdir()
    unique_id = str(uuid.uuid1())

    # Temporary files
    self.tmp_sequences = os.path.join(tmp_folder, unique_id + ".fasta")
    self.tmp_regions = os.path.join(tmp_folder, unique_id + ".bed")
    self.tmp_variants = os.path.join(tmp_folder, unique_id + ".vcf")
    self.tmp_output = os.path.join(tmp_folder, unique_id + "_out.vcf")

    # Exec command
    self.cmd = [
        "filterVCFPrimers.py",
        "--input-variants", self.tmp_variants,
        "--input-regions", self.tmp_regions,
        "--input-sequences", self.tmp_sequences,
        "--output-variants", self.tmp_output
    ]

    # Create fasta (the rulers give the 1-based positions, aligned on the sequence above them)
    with FastaIO(self.tmp_sequences, "w") as FH_seq:
        FH_seq.write(Sequence(
            "artificial_chr1",
            "NNNAAAATTTGGGGGGGGGGTTTAAANNN"))
        #    123456789| | | | | | | | | |
        #             10| 14| 18| 22| 26|
        #               12  16  20  24  28
        FH_seq.write(Sequence(
            "artificial_chr2",
            "CGATNNNCGAT"))
        #    123456789|
        #             10

    # Create VCF
    with VCFIO(self.tmp_variants, "w") as FH_var:
        FH_var.info = {"ZOI": HeaderInfoAttr("ZOI", "If the variant can be in interest area.", type="String", number="1")}
        FH_var.writeHeader()
        self.variants = [
            VCFRecord("artificial_chr1", 6, "alt_0", "A", ["AA"], None, None, {"ZOI": "no"}),
            VCFRecord("artificial_chr1", 8, "alt_1", "TT", ["T"], None, None, {"ZOI": "no"}),
            VCFRecord("artificial_chr1", 8, "alt_2", "T", ["TT"], None, None, {"ZOI": "yes"}),
            VCFRecord("artificial_chr1", 9, "alt_3", "TTGG", ["TT"], None, None, {"ZOI": "yes"}),
            VCFRecord("artificial_chr1", 14, "alt_4", "G", ["GG"], None, None, {"ZOI": "yes"}),
            VCFRecord("artificial_chr1", 18, "alt_5", "GGG", ["G"], None, None, {"ZOI": "yes"}),  # ZOI downstream limit deletion
            VCFRecord("artificial_chr1", 22, "alt_6", "T", ["TT"], None, None, {"ZOI": "yes"}),

            VCFRecord("artificial_chr1", 9, "alt_7", "TT", ["TC"], None, None, {"ZOI": "no"}),  # Substitution before end of upstream primer
            VCFRecord("artificial_chr1", 10, "alt_8", "TG", ["TC"], None, None, {"ZOI": "yes"}),  # Substitution in upstream limit of ZOI
            VCFRecord("artificial_chr1", 15, "alt_9", "GG", ["GC"], None, None, {"ZOI": "yes"}),  # Substitution in downstream limit of ZOI
            VCFRecord("artificial_chr1", 20, "alt_10", "GT", ["GC"], None, None, {"ZOI": "no"}),  # Substitution after start of downstream primer
            VCFRecord("artificial_chr1", 21, "alt_11", "TT", ["TC"], None, None, {"ZOI": "no"}),  # Substitution in downstream primer

            VCFRecord("artificial_chr2", 1, "alt_12", "C", ["CTT"], None, None, {"ZOI": "no"}),  # Insertion before end of upstream primer
            VCFRecord("artificial_chr2", 2, "alt_13", "G", ["GCC"], None, None, {"ZOI": "yes"}),  # Insertion in upstream limit of ZOI
            VCFRecord("artificial_chr2", 3, "alt_14", "AT", ["CCGC"], None, None, {"ZOI": "yes"}),  # Insertion in upstream limit of ZOI and without standardization
            VCFRecord("artificial_chr2", 9, "alt_15", "G", ["GCC"], None, None, {"ZOI": "yes"}),  # Insertion in downstream limit of ZOI
            VCFRecord("artificial_chr2", 9, "alt_16", "G", ["NNN"], None, None, {"ZOI": "yes"}),  # Insertion in downstream limit of ZOI and without standardization
            VCFRecord("artificial_chr2", 10, "alt_17", "-", ["CC"], None, None, {"ZOI": "yes"}),  # Insertion in downstream limit of ZOI
            VCFRecord("artificial_chr2", 10, "alt_18", "A", ["ATT"], None, None, {"ZOI": "no"}),  # Insertion after start of downstream primer

            VCFRecord("artificial_chr2", 1, "alt_19", "CG", ["C"], None, None, {"ZOI": "no"}),  # Deletion before end of upstream primer
            VCFRecord("artificial_chr2", 2, "alt_20", "GA", ["G"], None, None, {"ZOI": "yes"}),  # Deletion in upstream limit of ZOI
            VCFRecord("artificial_chr2", 3, "alt_21", "AT", ["C"], None, None, {"ZOI": "yes"}),  # Deletion in upstream limit of ZOI and without standardization
            VCFRecord("artificial_chr2", 6, "alt_22", "NNCG", ["N"], None, None, {"ZOI": "yes"}),  # Deletion in downstream limit of ZOI
            VCFRecord("artificial_chr2", 8, "alt_23", "CG", ["C"], None, None, {"ZOI": "yes"}),  # Deletion in downstream limit of ZOI
            VCFRecord("artificial_chr2", 8, "alt_24", "CG", ["T"], None, None, {"ZOI": "yes"}),  # Deletion in downstream limit of ZOI and without standardization
            VCFRecord("artificial_chr2", 9, "alt_25", "GA", ["G"], None, None, {"ZOI": "no"}),  # Deletion after start of downstream primer
            VCFRecord("artificial_chr2", 10, "alt_26", "A", ["-"], None, None, {"ZOI": "no"}),  # Deletion after start of downstream primer
            VCFRecord("artificial_chr2", 10, "alt_27", "AT", ["A"], None, None, {"ZOI": "no"}),  # Deletion after start of downstream primer
        ]
        for curr_var in self.variants:
            FH_var.write(curr_var)
help= 'Path to the definition of the amplicons (format: Illumina manifest).') group_output = parser.add_argument_group('Outputs') # Outputs group_output.add_argument( '-f', '--fwd-barcodes', default="fwd_barcodes.fasta", help= '******************************** (format: fasta). [Default: %(default)s]' ) group_output.add_argument( '-r', '--rvs-barcodes', default="rvs_barcodes.fasta", help= '******************************** (format: fasta). [Default: %(default)s]' ) args = parser.parse_args() # Process amplicons = getAmplicons(args.input_manifest) FH_fwd = FastaIO(args.fwd_barcodes, "w") FH_rvs = FastaIO(args.rvs_barcodes, "w") for ampl in amplicons: record_fwd = Sequence(ampl.name, ampl.up_primer) FH_fwd.write(record_fwd) record_rvs = Sequence(ampl.name, revcom(ampl.down_primer)) FH_rvs.write(record_rvs) FH_fwd.close() FH_rvs.close()
def setUp(self):
    """
    Create the temporary inputs and the command line used to test filterVCFTargets.py.

    Writes a two-sequence fasta with its faidx, a BED with three targets on
    artificial_chr1 and a VCF whose records carry, in the INFO field "target",
    the expectation attached to each variant (a target name or None).
    The written records are kept in self.variants.
    """
    tmp_folder = tempfile.gettempdir()
    unique_id = str(uuid.uuid1())

    # Temporary files
    self.tmp_sequences = os.path.join(tmp_folder, unique_id + ".fasta")
    self.tmp_faidx = os.path.join(tmp_folder, unique_id + ".fasta.fai")
    self.tmp_regions = os.path.join(tmp_folder, unique_id + ".bed")
    self.tmp_variants = os.path.join(tmp_folder, unique_id + ".vcf")
    self.tmp_output = os.path.join(tmp_folder, unique_id + "_out.vcf")

    # Exec command
    self.cmd = [
        "filterVCFTargets.py",
        "--mode", "remove",
        "--input-variants", self.tmp_variants,
        "--input-targets", self.tmp_regions,
        "--input-reference", self.tmp_sequences,
        "--output-variants", self.tmp_output
    ]

    # Create fasta
    with FastaIO(self.tmp_sequences, "w") as FH_seq:
        # Repeats: ****.... ...***   (NOTE(review): the column alignment of this marker line on the sequence is not known here)
        # The next line draws the three targets of the BED written below, aligned on the sequence;
        # the rulers under the sequence give the 1-based positions
        #    |----|        |------------|         |------|
        FH_seq.write(Sequence(
            "artificial_chr1",
            "CTCAGTCATGTATGTATGTGCTCACAAAGTAGTAGATCATGGCAC"))
        #    123456789| | | | | | | | | | | | | | | | | |
        #             10| 14| 18| 22| 26| 30| 34| 38| 42|
        #               12  16  20  24  28  32  36  40  44
        FH_seq.write(Sequence(
            "artificial_chr2",
            "CGATNNNCGAT"))
        #    123456789|
        #             10

    # Create faidx: tab-separated, one record per line (explicit escapes keep the separators unambiguous)
    with open(self.tmp_faidx, "w") as FH_fai:
        FH_fai.write(
            "artificial_chr1\t45\t17\t45\t46\n"
            "artificial_chr2\t11\t80\t11\t12"
        )

    # Create targets
    with BEDIO(self.tmp_regions, "w", write_nb_col=4) as FH_bed:
        FH_bed.write(BEDRecord("artificial_chr1", 1, 6, "target_1"))
        FH_bed.write(BEDRecord("artificial_chr1", 15, 28, "target_2"))
        FH_bed.write(BEDRecord("artificial_chr1", 38, 45, "target_3"))

    # Create VCF
    with VCFIO(self.tmp_variants, "w") as FH_var:
        FH_var.info = {
            "target": HeaderInfoAttr("target", "The ID of the overlapped target.", type="String", number="1")
        }
        FH_var.writeHeader()
        self.variants = [
            # Substit single nt
            VCFRecord("artificial_chr1", 14, "alt_00", "G", ["T"], None, None, {"target": None}),  # Before target ; first nt before target
            VCFRecord("artificial_chr1", 15, "alt_01", "G", ["T"], None, None, {"target": "target_2"}),  # On target ; first nt of target
            VCFRecord("artificial_chr1", 21, "alt_02", "C", ["G"], None, None, {"target": "target_2"}),  # On target
            VCFRecord("artificial_chr1", 28, "alt_03", "A", ["G"], None, None, {"target": "target_2"}),  # On target ; last nt
            VCFRecord("artificial_chr1", 29, "alt_04", "G", ["C"], None, None, {"target": None}),  # After target ; first nt after target
            # Substit multi nt
            VCFRecord("artificial_chr1", 7, "alt_05", "CATGTATG", ["GTACCCGC"], None, None, {"target": None}),  # Before target ; first nt before target
            VCFRecord("artificial_chr1", 11, "alt_06", "TATGTATG", ["GTACCCGC"], None, None, {"target": "target_2"}),  # Overlap target start
            VCFRecord("artificial_chr1", 13, "alt_07", "TGTATGTGCTCACAAAGTA", ["CCCGCCCCTACATTGCAGT"], None, None, {"target": "target_2"}),  # Include target
            VCFRecord("artificial_chr1", 15, "alt_08", "TATGTGCTCACAAA", ["CGCCCCTACATTGC"], None, None, {"target": "target_2"}),  # Exact target
            VCFRecord("artificial_chr1", 21, "alt_09", "CTCACAA", ["GTACCCG"], None, None, {"target": "target_2"}),  # Included by target
            VCFRecord("artificial_chr1", 24, "alt_10", "ACAAAGTA", ["GTACCCG"], None, None, {"target": "target_2"}),  # Overlap target end
            VCFRecord("artificial_chr1", 29, "alt_11", "GTAGTAGAT", ["GTACCCGA"], None, None, {"target": None}),  # After target ; first nt after target
            # Ins single nt
            VCFRecord("artificial_chr1", 14, "alt_12", "G", ["GA"], None, None, {"target": None}),  # Before target ; first nt before target
            VCFRecord("artificial_chr1", 15, "alt_12.2", "-", ["A"], None, None, {"target": None}),  # Before target ; first nt before target
            VCFRecord("artificial_chr1", 15, "alt_13", "A", ["TG"], None, None, {"target": "target_2"}),  # On target ; first nt of target
            VCFRecord("artificial_chr1", 21, "alt_14", "C", ["CG"], None, None, {"target": "target_2"}),  # On target
            VCFRecord("artificial_chr1", 27, "alt_15", "A", ["AT"], None, None, {"target": "target_2"}),  # On target ; last nt
            VCFRecord("artificial_chr1", 28, "alt_15.2", "-", ["T"], None, None, {"target": "target_2"}),  # On target ; last nt
            VCFRecord("artificial_chr1", 28, "alt_16", "A", ["AT"], None, None, {"target": None}),  # After target ; first nt after target
            # Movable ins single nt
            VCFRecord("artificial_chr1", 14, "alt_17", "G", ["GT"], None, None, {"target": "target_2"}),  # Movable to first nt of target
            VCFRecord("artificial_chr1", 28, "alt_18", "A", ["AA"], None, None, {"target": "target_2"}),  # Movable to last nt of target
            # Del single nt
            VCFRecord("artificial_chr1", 14, "alt_19", "G", [""], None, None, {"target": None}),  # Before target ; first nt before target
            VCFRecord("artificial_chr1", 15, "alt_20", "T", [""], None, None, {"target": "target_2"}),  # On target ; first nt of target
            VCFRecord("artificial_chr1", 21, "alt_21", "C", [""], None, None, {"target": "target_2"}),  # On target
            VCFRecord("artificial_chr1", 28, "alt_22", "A", [""], None, None, {"target": "target_2"}),  # On target ; last nt
            VCFRecord("artificial_chr1", 29, "alt_23", "G", [""], None, None, {"target": None}),  # After target ; first nt after target
            # Del multi nt
            VCFRecord("artificial_chr1", 11, "alt_24", "TATG", ["T"], None, None, {"target": None}),  # Before target ; first nt before target
            VCFRecord("artificial_chr1", 13, "alt_25", "TGTA", ["T"], None, None, {"target": "target_2"}),  # On target ; first nt of target
            VCFRecord("artificial_chr1", 20, "alt_26", "GCTC", ["G"], None, None, {"target": "target_2"}),  # On target
            VCFRecord("artificial_chr1", 27, "alt_27", "AAGT", ["A"], None, None, {"target": "target_2"}),  # On target ; last nt
            VCFRecord("artificial_chr1", 28, "alt_28", "AGT", ["A"], None, None, {"target": None}),  # After target ; first nt after target
            # Movable del multi nt
            VCFRecord("artificial_chr1", 7, "alt_29", "CATGT", ["C"], None, None, {"target": "target_2"}),  # On repeat and movable to first nt of target
            VCFRecord("artificial_chr1", 12, "alt_30", "ATG", ["A"], None, None, {"target": "target_2"}),  # Movable to first nt of target
            VCFRecord("artificial_chr1", 28, "alt_31", "AGTA", ["A"], None, None, {"target": "target_2"}),  # Movable to last nt of target
            VCFRecord("artificial_chr1", 30, "alt_32", "TAGT", ["T"], None, None, {"target": "target_2"}),  # On repeat and movable to last nt of target
        ]
        for curr_var in self.variants:
            FH_var.write(curr_var)
def setUp(self):
    """
    Create the temporary inputs and the command line used to test filterVCFHomopolym.py.

    Writes a six-sequence fasta with its faidx and a VCF whose records carry,
    in the INFO field "is_filtered", the expectation attached to each variant
    (1 or 0). The written records are kept in self.variants.
    """
    tmp_folder = tempfile.gettempdir()
    unique_id = str(uuid.uuid1())

    # Temporary files
    self.tmp_sequences = os.path.join(tmp_folder, unique_id + ".fasta")
    self.tmp_faidx = os.path.join(tmp_folder, unique_id + ".fasta.fai")
    self.tmp_variants = os.path.join(tmp_folder, unique_id + ".vcf")
    self.tmp_output = os.path.join(tmp_folder, unique_id + "_out.vcf")

    # Exec command
    self.cmd = [
        "filterVCFHomopolym.py",
        "--mode", "remove",
        "--homopolym-length", "4",
        "--input-variants", self.tmp_variants,
        "--input-reference", self.tmp_sequences,
        "--output-variants", self.tmp_output
    ]

    # Create fasta (the ruler gives the 1-based positions, aligned on the sequence literals below)
    with FastaIO(self.tmp_sequences, "w") as FH_seq:
        #               12  16  20  24  28  32  36  40  44  48  52  56  60  64  68  72  76  80  84  88  92  96  100
        #     2 4 6 8 10| 14| 18| 22| 26| 30| 34| 38| 42| 46| 50| 54| 58| 62| 66| 70| 74| 78| 82| 86| 90| 94| 98| 102
        #     | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
        FH_seq.write(Sequence(
            "artificial_chr1",
            "CGAATATGATCCAGCAATAAAAAGTTCCTACAGGAAAAAAGTAGAAAGAGAAACCTGTCTCTTGGATATTCTCGACACAGCAGGTCAAG"))
        FH_seq.write(Sequence(
            "artificial_chr2",
            "CGAATATGATCCAGCAATAAAAAGCTCCTACAGGCAAAAGTAGGCAAAGAGAAACCTGTCTCTTGGATATTCTCGACACAGCAGGTCAA"))
        FH_seq.write(Sequence(
            "artificial_chr3",
            "CGAATATGATCCAGCAATGAAAATTCCTACAGGTAAAACGTAGAAAGAGAAACCTGTCTCTTGGATATTCTCGACACAGCAGGTCAAG"))
        FH_seq.write(Sequence(
            "artificial_chr4",
            "CGAATATGATCCAGCAATAAAAAGTTCCTACAGGAAAAAAGTAGAAAGAGAAACCTGTCAAAAGGATATTCTCGACAAAACAGCAGAAAGTCAAG"))
        FH_seq.write(Sequence(
            "artificial_chr5",
            "CGAATATGATCCAGTAATAAAAAGTTCCTACAGGAAAAAAGTAGAAAGAGAAACCTGTCTCTTGGATATTCTCGACACAGCAGGTCAAG"))
        FH_seq.write(Sequence(
            "artificial_chr6",
            "CGAATATGATCCAGCAATAAAAAGTTCCTACAGGAAAAAAGTAGAAAGCACAACCTGTCTCTTGGAAAATCTCGACACAGCAGGTAAAACAATGCAGTAAAT"))
        # Sequences adjacent to each variant (was a bare string statement; kept as a comment table):
        # Variant  before_start  before_end  before_seq  after_start  after_end  after_seq
        # alt_00   10            13          TCCA        15           18         CAAT
        # alt_01   20            23          AAAA        25           28         TTCC
        # alt_02   30            33          ACAG        35           38         AAAA
        # alt_03   40            43          AGTA        45           48         AAAG
        # alt_04   10            13          TCCA        16           19         AATA
        # alt_05   20            23          AAAA        26           29         TCCT
        # alt_06   30            33          ACAG        36           39         AAAA
        # alt_07   40            43          GTAG        46           49         AAAG
        # alt_08   11            14          CCAG        15           18         CAAT
        # alt_09   20            23          AAAA        24           27         TTCC
        # alt_10   31            34          AGGT        35           38         AAAA
        # alt_11   40            43          GTAG        44           47         AAAG
        # alt_12   11            14          CCAG        15           18         CAAT
        # alt_13   20            23          AAAA        24           27         GTTC
        # alt_14   31            34          CAGG        35           38         AAAA
        # alt_15   41            44          GTAG        45           48         AAAG
        # alt_16   50            53          GAAA        57           60         GTCA
        # alt_17   60            63          AAAA        67           70         TATT
        # alt_18   70            73          TCTC        77           80         AAAA
        # alt_19   80            83          ACAG        87           90         AAAG
        # alt_20   11            14          CCAG        16           19         AATA
        # alt_21   20            23          AAAA        25           28         TTCC
        # alt_22   31            34          CAGG        36           39         AAAA
        # alt_23   40            43          AGTA        45           48         AAAG
        # alt_24   11            14          CCAG        17           20         ATAA
        # alt_25   19            22          AAAA        26           29         TCCT
        # alt_26   29            32          TACA        35           38         AAAA
        # alt_27   38            41          AAAG        45           48         AAAG
        # alt_28   50            53          ACAA        61           64         CTTG
        # alt_29   66            69          AAAA        76           79         CACA
        # alt_30   76            79          CACA        86           89         AAAA
        # alt_31   88            91          AACA        99           102        AAAT

    # Create faidx: tab-separated, one record per line (explicit escapes keep the separators unambiguous)
    with open(self.tmp_faidx, "w") as FH_fai:
        FH_fai.write(
            "artificial_chr1\t89\t17\t89\t90\n"
            "artificial_chr2\t89\t124\t89\t90\n"
            "artificial_chr3\t88\t231\t88\t89\n"
            "artificial_chr4\t95\t337\t95\t96\n"
            "artificial_chr5\t89\t450\t89\t90\n"
            "artificial_chr6\t102\t557\t102\t103"
        )

    # Create VCF
    with VCFIO(self.tmp_variants, "w") as FH_var:
        FH_var.info = {
            "is_filtered": HeaderInfoAttr(
                "is_filtered", "1 if the variant is adjacent to an homopolymer.", type="Integer", number="1"
            )
        }
        FH_var.writeHeader()
        self.variants = [
            # Substit single nt
            VCFRecord("artificial_chr1", 14, "alt_00", "G", ["T"], None, None, {"is_filtered": 0}),  # Without adjacent homopolymers
            VCFRecord("artificial_chr1", 24, "alt_01", "G", ["T"], None, None, {"is_filtered": 1}),  # Adjacent homopolymers upstream
            VCFRecord("artificial_chr1", 34, "alt_02", "G", ["T"], None, None, {"is_filtered": 1}),  # Adjacent homopolymers downstream
            VCFRecord("artificial_chr1", 44, "alt_03", "G", ["T"], None, None, {"is_filtered": 0}),  # Adjacent too short homopolymers
            # Substit multi nt
            VCFRecord("artificial_chr2", 14, "alt_04", "GC", ["TA"], None, None, {"is_filtered": 0}),  # Without adjacent homopolymers
            VCFRecord("artificial_chr2", 24, "alt_05", "GC", ["TA"], None, None, {"is_filtered": 1}),  # Adjacent homopolymers upstream
            VCFRecord("artificial_chr2", 34, "alt_06", "GC", ["TA"], None, None, {"is_filtered": 1}),  # Adjacent homopolymers downstream
            VCFRecord("artificial_chr2", 44, "alt_07", "GC", ["TA"], None, None, {"is_filtered": 0}),  # Adjacent too short homopolymers
            # Ins single nt
            VCFRecord("artificial_chr3", 14, "alt_08", "G", ["GT"], None, None, {"is_filtered": 0}),  # Without adjacent homopolymers
            VCFRecord("artificial_chr3", 23, "alt_09", "A", ["AT"], None, None, {"is_filtered": 1}),  # Adjacent homopolymers upstream
            VCFRecord("artificial_chr3", 34, "alt_10", "T", ["TA"], None, None, {"is_filtered": 1}),  # Adjacent homopolymers downstream
            VCFRecord("artificial_chr3", 43, "alt_11", "G", ["GT"], None, None, {"is_filtered": 0}),  # Adjacent too short homopolymers
            # Ins multi nt
            VCFRecord("artificial_chr4", 14, "alt_12", "G", ["GTA"], None, None, {"is_filtered": 0}),  # Without adjacent homopolymers
            VCFRecord("artificial_chr4", 23, "alt_13", "A", ["ATA"], None, None, {"is_filtered": 1}),  # Adjacent homopolymers upstream
            VCFRecord("artificial_chr4", 34, "alt_14", "G", ["GTA"], None, None, {"is_filtered": 1}),  # Adjacent homopolymers downstream
            VCFRecord("artificial_chr4", 44, "alt_15", "G", ["GTC"], None, None, {"is_filtered": 0}),  # Adjacent too short homopolymer
            VCFRecord("artificial_chr4", 54, "alt_16", "CCT", ["ATCCAGA"], None, None, {"is_filtered": 0}),  # Without adjacent homopolymers
            VCFRecord("artificial_chr4", 64, "alt_17", "GGA", ["CTCCAGT"], None, None, {"is_filtered": 1}),  # Adjacent homopolymers upstream
            VCFRecord("artificial_chr4", 74, "alt_18", "GAC", ["ATCCAGT"], None, None, {"is_filtered": 1}),  # Adjacent homopolymers downstream
            VCFRecord("artificial_chr4", 84, "alt_19", "CAG", ["ATCCAGT"], None, None, {"is_filtered": 0}),  # Adjacent too short homopolymer
            # Del single nt
            VCFRecord("artificial_chr5", 14, "alt_20", "GT", ["G"], None, None, {"is_filtered": 0}),  # Without adjacent homopolymers
            VCFRecord("artificial_chr5", 23, "alt_21", "AG", ["A"], None, None, {"is_filtered": 1}),  # Adjacent homopolymers upstream
            VCFRecord("artificial_chr5", 34, "alt_22", "GA", ["G"], None, None, {"is_filtered": 1}),  # Adjacent homopolymers downstream
            VCFRecord("artificial_chr5", 43, "alt_23", "AG", ["A"], None, None, {"is_filtered": 0}),  # Adjacent too short homopolymers
            # Del multi nt
            VCFRecord("artificial_chr6", 14, "alt_24", "GCA", ["G"], None, None, {"is_filtered": 0}),  # Without adjacent homopolymers
            VCFRecord("artificial_chr6", 23, "alt_25", "AGT", ["C"], None, None, {"is_filtered": 1}),  # Adjacent homopolymers upstream
            VCFRecord("artificial_chr6", 32, "alt_26", "AGG", ["A"], None, None, {"is_filtered": 1}),  # Adjacent homopolymers downstream
            VCFRecord("artificial_chr6", 42, "alt_27", "TAG", ["C"], None, None, {"is_filtered": 0}),  # Adjacent too short homopolymer
            VCFRecord("artificial_chr6", 54, "alt_28", "CCTGTCT", ["GAA"], None, None, {"is_filtered": 0}),  # Without adjacent homopolymers
            VCFRecord("artificial_chr6", 70, "alt_29", "TCTCGA", ["CCC"], None, None, {"is_filtered": 1}),  # Adjacent homopolymers upstream
            VCFRecord("artificial_chr6", 80, "alt_30", "GCAGGT", ["CCC"], None, None, {"is_filtered": 1}),  # Adjacent homopolymers downstream
            VCFRecord("artificial_chr6", 92, "alt_31", "ATGCAGT", ["CCC"], None, None, {"is_filtered": 0}),  # Adjacent too short homopolymer
        ]
        for curr_var in self.variants:
            FH_var.write(curr_var)
args = parser.parse_args() # Logger initialisation logging.basicConfig( level=logging.DEBUG, format= '%(asctime)s -- [%(filename)s][pid:%(process)d][%(levelname)s] -- %(message)s' ) log = logging.getLogger(os.path.basename(__file__)) log.info(" ".join(sys.argv)) log.info("Random seed used: {}".format(args.random_seed)) # Get number of duplications by reads log.info("Get duplication count for each read") random.seed(args.random_seed) nb_reads = FastaIO.nbSeq(args.input_R1) if nb_reads < 10000: log.error( "The number of reads in {} is unsufficient to simulate duplication (found: {} ; expected: {})." .format(args.input_R1, nb_reads, 10000)) nb_occurences = getNbOccur(args.duplication_profile, nb_reads) # Witre reads log.info("Write reads") with FastaIO(args.output_R1, "w") as FH_out_R1: with FastaIO(args.output_R2, "w") as FH_out_R2: with FastaIO(args.input_R1) as FH_in_R1: with FastaIO(args.input_R2) as FH_in_R2: for curr_nb_occur, R1, R2 in zip(nb_occurences, FH_in_R1, FH_in_R2): description = "dupCount={}".format(curr_nb_occur)
def setUp(self):
    """
    Create the temporary inputs and the command line used to test standardizeVCF.py.

    Writes a two-sequence fasta with its faidx and a VCF whose records carry
    their input annotations in the INFO field "ANN" and the expectations in
    the INFO fields "expected" and "expectedANN".
    The written records are kept in self.variants.
    """
    tmp_folder = tempfile.gettempdir()
    unique_id = str(uuid.uuid1())

    # Temporary files
    self.tmp_sequences = os.path.join(tmp_folder, unique_id + ".fasta")
    self.tmp_faidx = os.path.join(tmp_folder, unique_id + ".fasta.fai")
    self.tmp_variants = os.path.join(tmp_folder, unique_id + ".vcf")
    self.tmp_output = os.path.join(tmp_folder, unique_id + "_out.vcf")

    # Exec command
    self.cmd = [
        "standardizeVCF.py",
        "--trace-unstandard",
        "--input-reference", self.tmp_sequences,
        "--input-variants", self.tmp_variants,
        "--output-variants", self.tmp_output
    ]

    # Create fasta (the rulers give the 1-based positions, aligned on the sequence above them)
    with FastaIO(self.tmp_sequences, "w") as FH_seq:
        # Repeats: ****.... ...***
        # Region: |----| |------------| |------|
        # NOTE(review): the column alignment of the two marker lines above on the sequence is not known here
        FH_seq.write(Sequence(
            "artificial_chr1",
            "CTCAGTCATGTATGTATGTGCTCACAAAGTAGTAGATCATGGCAC"))
        #    123456789| | | | | | | | | | | | | | | | | |
        #             10| 14| 18| 22| 26| 30| 34| 38| 42|
        #               12  16  20  24  28  32  36  40  44
        FH_seq.write(Sequence(
            "artificial_chr2",
            "CGATNNNCGAT"))
        #    123456789|
        #             10

    # Create faidx: tab-separated, one record per line (explicit escapes keep the separators unambiguous)
    with open(self.tmp_faidx, "w") as FH_fai:
        FH_fai.write(
            "artificial_chr1\t45\t17\t45\t46\n"
            "artificial_chr2\t11\t80\t11\t12"
        )

    # Create VCF
    with VCFIO(self.tmp_variants, "w") as FH_var:
        FH_var.info = {
            "expected": HeaderInfoAttr("expected", "Standardized version of {chrom}:{pos}={ref}/{alt}.", type="String", number="."),
            "ANN": HeaderInfoAttr("ANN", "Annotation of variants Format: Allele|Annotation_id|Alt_allele_idx", type="String", number="."),
            "expectedANN": HeaderInfoAttr("expectedANN", "Standardized version of annotations Format: Allele|Annotation_id|Alt_allele_idx", type="String", number=".")
        }
        FH_var.writeHeader()
        self.variants = [
            # Substit single nt
            VCFRecord("artificial_chr1", 14, "sub_01", "G", ["T"], None, None, {
                "expected": ["artificial_chr1:14=G/T"],
                "ANN": ["T|ann_1|0", "T|ann_2|0", "A|ann_3|"],
                "expectedANN": ["T|ann_1|0", "T|ann_2|0"]
            }),
            VCFRecord("artificial_chr1", 19, "sub_02", "T", ["A", "C"], None, None, {
                "expected": ["artificial_chr1:19=T/A", "artificial_chr1:19=T/C"],
                "ANN": ["A|ann_1|0", "A|ann_2|0", "T|ann_3|"],
                "expectedANN": ["A|ann_1|0", "A|ann_2|0"]
            }),
            # Substit multi nt
            VCFRecord("artificial_chr1", 7, "sub_03", "CATGTATG", ["GTACCCGC"], None, None, {
                "expected": ["artificial_chr1:7=CATGTATG/GTACCCGC"],
                "ANN": ["GTACCCGC|ann_1|0", "GTACCCGC|ann_2|0", "GTGT|ann_3|"],
                "expectedANN": ["GTACCCGC|ann_1|0", "GTACCCGC|ann_2|0"]
            }),
            VCFRecord("artificial_chr1", 11, "sub_04", "TATGTATG", ["GTACCCGC", "GTACCCAA"], None, None, {
                "expected": ["artificial_chr1:11=TATGTATG/GTACCCGC", "artificial_chr1:11=TATGTATG/GTACCCAA"],
                "ANN": ["GTACCCGC|ann_1|0", "GTACCCGC|ann_2|0", "GTACCCAA|ann_3|1"],
                "expectedANN": ["GTACCCGC|ann_1|0", "GTACCCGC|ann_2|0", "GTACCCAA|ann_3|1"]
            }),
            # Insertion single nt
            VCFRecord("artificial_chr1", 14, "ins_01", "G", ["GA"], None, None, {
                "expected": ["artificial_chr1:14=G/GA"],
                "ANN": ["GA|ann_1|0", "GA|ann_2|0", "GT|ann_3|"],
                "expectedANN": ["GA|ann_1|0", "GA|ann_2|0"]
            }),
            VCFRecord("artificial_chr1", 20, "ins_02", "-", ["A"], None, None, {
                "expected": ["artificial_chr1:19=T/TA"],
                "ANN": ["A|ann_1|0", "A|ann_2|0", "T|ann_3|"],
                "expectedANN": ["TA|ann_1|0", "TA|ann_2|0"]
            }),
            VCFRecord("artificial_chr1", 14, "ins_03", "G", ["GA", "GC"], None, None, {
                "expected": ["artificial_chr1:14=G/GA", "artificial_chr1:14=G/GC"],
                "ANN": ["GA|ann_1|0", "GA|ann_2|0", "GC|ann_3|1", "GT|ann_4|"],
                "expectedANN": ["GA|ann_1|0", "GA|ann_2|0", "GC|ann_3|1"]
            }),
            VCFRecord("artificial_chr1", 20, "ins_04", "-", ["A", "C"], None, None, {
                "expected": ["artificial_chr1:19=T/TA", "artificial_chr1:19=T/TC"],
                "ANN": ["A|ann_1|0", "A|ann_2|0", "C|ann_3|1", "T|ann_4|"],
                "expectedANN": ["TA|ann_1|0", "TA|ann_2|0", "TC|ann_3|1"]
            }),
            # Insertion multi nt
            VCFRecord("artificial_chr1", 14, "ins_05", "G", ["GATGC"], None, None, {
                "expected": ["artificial_chr1:14=G/GATGC"],
                "ANN": ["GATGC|ann_1|0", "GATGC|ann_2|0", "GAAAC|ann_3|"],
                "expectedANN": ["GATGC|ann_1|0", "GATGC|ann_2|0"]
            }),
            VCFRecord("artificial_chr1", 20, "ins_06", "-", ["AAATC"], None, None, {
                "expected": ["artificial_chr1:19=T/TAAATC"],
                "ANN": ["AAATC|ann_1|0", "AAATC|ann_2|0", "GAAAC|ann_3|"],
                "expectedANN": ["TAAATC|ann_1|0", "TAAATC|ann_2|0"]
            }),
            # Movable insertion multi nt
            VCFRecord("artificial_chr1", 14, "ins_07", "G", ["GTG"], None, None, {
                "expected": ["artificial_chr1:12=A/ATG"],
                "ANN": ["GTG|ann_1|0", "GTG|ann_2|0", "GAAAC|ann_3|"],
                "expectedANN": ["ATG|ann_1|0", "ATG|ann_2|0"]
            }),
            VCFRecord("artificial_chr1", 27, "ins_08", "A", ["AAAA"], None, None, {
                "expected": ["artificial_chr1:25=C/CAAA"],
                "ANN": ["AAAA|ann_1|0", "AAAA|ann_2|0", "CAAA|ann_3|"],
                "expectedANN": ["CAAA|ann_1|0", "CAAA|ann_2|0"]
            }),
            # Deletion single nt
            VCFRecord("artificial_chr1", 14, "del_01", "G", [""], None, None, {
                "expected": ["artificial_chr1:13=TG/T"],
                "ANN": ["-|ann_1|0", "-|ann_2|0", "T|ann_3|"],
                "expectedANN": ["T|ann_1|0", "T|ann_2|0"]
            }),
            VCFRecord("artificial_chr1", 14, "del_02", "G", ["-"], None, None, {
                "expected": ["artificial_chr1:13=TG/T"],
                "ANN": ["-|ann_1|0", "-|ann_2|0", "T|ann_3|"],
                "expectedANN": ["T|ann_1|0", "T|ann_2|0"]
            }),
            VCFRecord("artificial_chr1", 13, "del_03", "TG", ["T"], None, None, {
                "expected": ["artificial_chr1:13=TG/T"],
                "ANN": ["T|ann_1|0", "T|ann_2|0", "A|ann_3|"],
                "expectedANN": ["T|ann_1|0", "T|ann_2|0"]
            }),
            VCFRecord("artificial_chr1", 13, "del_04", "TG", ["T", "-"], None, None, {
                "expected": ["artificial_chr1:13=TG/T", "artificial_chr1:12=ATG/A"],
                "ANN": ["T|ann_1|0", "T|ann_2|0", "-|ann_3|1"],
                "expectedANN": ["T|ann_1|0", "T|ann_2|0", "A|ann_3|1"]
            }),
            # Movable deletion multi nt
            VCFRecord("artificial_chr1", 11, "del_05", "TATG", ["T", "TA", "-"], None, None, {
                "expected": ["artificial_chr1:11=TATG/T", "artificial_chr1:12=ATG/A", "artificial_chr1:7=CATGT/C"],
                "ANN": ["T|ann_1|0", "T|ann_2|0", "TA|ann_3|1", "-|ann_4|2"],
                "expectedANN": ["T|ann_1|0", "T|ann_2|0", "A|ann_3|1", "C|ann_4|2"]
            }),
        ]
        for curr_var in self.variants:
            FH_var.write(curr_var)
def testWrite(self):
    """Write the expected records with FastaIO and compare the output with the mono-line fixture."""
    with FastaIO(self.tmp_out, "w") as writer:
        for expected_record in self.expected_rec:
            writer.write(expected_record)
    # The written file must be a valid fasta...
    is_valid = FastaIO.isValid(self.tmp_out)
    self.assertTrue(is_valid)
    # ... and identical to the reference mono-line file
    same_content = filecmp.cmp(self.tmp_out, self.tmp_mono_line)
    self.assertTrue(same_content)