Exemple #1
0
def _run_art_illumina(fasta_fn, readlen, readcov, out_prefix):
    """Run art_illumina.exe on fasta_fn and block until it exits.

    The code that calls this expects the run to leave <out_prefix>.sam,
    <out_prefix>.fq and <out_prefix>.aln behind.
    """
    # Example usage: art_illumina.exe -i ../at9_chr3.fasta -l 34 -f 2 -o Output/RandomAtReads -sam
    # An argument list (rather than one joined string) keeps file names that
    # contain spaces intact.
    params = ["art_illumina.exe", "-i", str(fasta_fn), "-l", str(readlen),
              "-f", str(readcov), "-o", str(out_prefix), "-sam", "-q"]
    simulation = subprocess.Popen(params)
    simulation.wait()


def _shift_sam_positions(in_fn, out_fn, offset):
    """Add offset to field index 3 of every record in in_fn and write the result to out_fn."""
    # Field index 3 is presumably the SAM POS column -- confirm against the SAM class.
    shifted = SAM.parse(in_fn, True)
    for record in shifted.sam_list:
        record[3] = str(int(record[3]) + offset)
    shifted.output(out_fn)


def _overlaps_previous(chrom, spos, epos, prev_sel):
    """Return True if [spos, epos] on chrom touches any (chrom, start, end) entry in prev_sel."""
    # Standard interval-intersection test with inclusive bounds. It covers a new
    # region starting inside, ending inside, or completely enclosing an old one.
    return any(
        chrom == c and int(spos) <= int(end) and int(start) <= int(epos)
        for (c, start, end) in prev_sel
    )


def _select_dup_region(chr_list, chr_len, duplen, prev_sel):
    """Pick a random (chrom, spos, epos) of length duplen that avoids every region in prev_sel.

    Uses rejection sampling: candidates are drawn until one fits inside its
    chromosome and does not overlap an earlier selection. Does not return if
    no such region exists.
    """
    while True:
        chrom = sample(chr_list, 1)[0]
        spos = randrange(0, len(chr_len[chrom]))
        epos = spos + duplen
        if epos > len(chr_len[chrom]):
            continue
        # Each candidate is checked from scratch against all previous regions.
        if not _overlaps_previous(chrom, spos, epos, prev_sel):
            return chrom, spos, epos


def simulate_reads(ref, chr, readlen, readcov, dupnum, duplen, sam, chr_len):
    """Simulate reads with art_illumina and add extra reads for random duplicated regions.

    Every selected chromosome is simulated in 10,000-base chunks, then dupnum
    non-overlapping regions of length duplen are simulated a second time. All
    intermediate SAM files are merged, reads are renumbered, header lengths are
    corrected, the merged file is written to sam[:-3], and FixARTSAMFile.py is
    run to produce sam. The duplicated regions are printed at the end.

    Args:
        ref: Not used by this function.
        chr: "All" to use every key of chr_len matching ``Chr<digits>``, or a
            comma-separated list of chromosome names.
        readlen: Value passed to art_illumina as ``-l``.
        readcov: Value passed to art_illumina as ``-f``.
        dupnum: Number of duplicated regions to generate.
        duplen: Length of each duplicated region.
        sam: Final output file name; intermediate names are built from sam[:-3].
        chr_len: Mapping from chromosome name to its sequence (it is sliced and
            written out as FASTA, and its length is used as chromosome length).

    Raises:
        ValueError: If dupnum > 0 and no selected chromosome is at least duplen
            long (a region could never be placed).
    """
    chunk_size = 10000
    # List containing all SAM files to be combined together. Regular read
    # simulation SAM files come first, regional duplication SAM files later.
    final_SAM_fns = []

    if chr == "All":
        chr_pattern = re.compile(r"Chr\d{1,}")
        chr_list = [name for name in chr_len if chr_pattern.match(name)]
    else:
        chr_list = chr.split(',')

    for chrom in chr_list:
        sequence = chr_len[chrom]
        for offset in range(0, len(sequence), chunk_size):
            out_fn = str(sam[:-3]) + "_temp_" + str(chrom) + "_" + str(offset)
            temp_fasta = "regtemp_" + str(offset) + ".fasta"
            with open(temp_fasta, 'w') as outfile:
                outfile.write(">" + str(chrom) + "\n" + sequence[offset:offset + chunk_size])
            _run_art_illumina(temp_fasta, readlen, readcov, out_fn)
            new_out_fn = out_fn + ".sam"
            if offset > 0:
                # Positions are relative to the chunk; move them back to
                # chromosome coordinates.
                _shift_sam_positions(new_out_fn, new_out_fn, offset)
            os.remove(temp_fasta)
            os.remove(out_fn + ".fq")
            os.remove(out_fn + ".aln")
            final_SAM_fns.append(new_out_fn)  # Regular read profile files assigned here

    if dupnum > 0 and not any(len(chr_len[name]) >= duplen for name in chr_list):
        # Without this check the region search below would never terminate.
        raise ValueError("duplen (" + str(duplen) + ") is longer than every selected chromosome")

    prev_sel = []
    for dup_index in range(1, dupnum + 1):
        chrom, spos, epos = _select_dup_region(chr_list, chr_len, duplen, prev_sel)
        prev_sel.append((chrom, spos, epos))

        temp_fasta = "duptemp_" + str(dup_index) + ".fasta"
        dup_prefix = temp_fasta[:-6]
        with open(temp_fasta, 'w') as outfile:
            outfile.write(">" + str(chrom) + "\n" + chr_len[chrom][spos:epos])
        _run_art_illumina(temp_fasta, readlen, readcov, dup_prefix)

        dup_fn = dup_prefix + ".sam"
        shifted_fn = dup_fn[:-4] + "_temp.sam"
        _shift_sam_positions(dup_fn, shifted_fn, spos)
        os.remove(temp_fasta)
        os.remove(dup_prefix + ".fq")
        os.remove(dup_prefix + ".aln")
        os.remove(dup_fn)
        final_SAM_fns.append(shifted_fn)

    final_SAM = SAM.parse(final_SAM_fns, True)
    # NOTE(review): this pattern and the [:5] / [5:] slices below assume read
    # names of the form "Chr<one digit>-<number>"; confirm whether chromosome
    # names with more digits need to be supported.
    read_name_pattern = re.compile(r"Chr\d{1}-(\d+)")

    # Numbering continues from the number carried by the first record's name.
    max_num = int(read_name_pattern.match(final_SAM.sam_list[0][0]).group(1))
    min_met = False
    # Ensure that read names from one SAM file do not coincide with read names
    # from another SAM file: records are left alone until the first read
    # numbered 1 is seen; every matching record after it gets the next number.
    for idx in range(len(final_SAM.sam_list)):
        r_name = final_SAM.sam_list[idx][0]
        match = read_name_pattern.match(r_name)
        if not match:
            continue
        if int(match.group(1)) == 1 and not min_met:
            min_met = True
            continue
        if min_met:
            max_num += 1
            line = final_SAM.sam_list[idx]
            line[0] = r_name[:5] + str(max_num)
            final_SAM.sam_list[idx] = line

    final_SAM.header = sorted(final_SAM.header, key=lambda read: read[1][6:])
    final_SAM.sam_list = sorted(final_SAM.sam_list, key=lambda read: int(read[0][5:]))
    # Set chromosome length in header portion of SAM file to correct length
    # (will be 10,000 in temporary simulation SAM files).
    for idx in range(len(final_SAM.header)):
        header_chrom = final_SAM.header[idx][1][3:]
        final_SAM.header[idx][2] = "LN:" + str(len(chr_len[header_chrom]))
    final_SAM.output(sam[:-3])
    for file in final_SAM_fns:
        os.remove(file)

    # Replaces new CIGAR string format for matches, which uses = and X, with the
    # old format M for both. The command is kept as a single string because
    # pypath is defined outside this function and its exact form is not visible here.
    params = ' '.join([str(pypath) + " FixARTSAMFile.py", "-i", sam[:-3], "-o", sam])
    fixSAM = subprocess.Popen(params)
    fixSAM.wait()

    print("Duplicated regions are located at:\n")
    for entry in sorted(prev_sel, key=lambda entry: (entry[0], entry[1])):
        (region_chrom, region_start, region_end) = entry
        print(str(region_chrom), ":", str(region_start), "-", str(region_end), sep="")