def main(argv=None):
    """Report fusion points of overlapping read pairs and print a PSSM.

    Steps, in order:

    1. Parse the command line with ``process_command_line``.
    2. Load the genome sequence (first FASTA record of ``settings.genome``).
    3. Read the tab-delimited ``settings.list_reads`` file. Rows whose
       column 7 equals ``"single"`` are skipped. For the remaining rows,
       column 6 is used as the read name, columns 1/2 as the 5' position
       and strand, and columns 4/5 as the 3' position and strand. Both
       positions are decremented by one (presumably 1-based input turned
       into 0-based indices; confirm against the file producer).
    4. Fetch both mates' sequences from ``settings.bamfile``.
    5. For each read present in both mate collections, merge the mates with
       ``find_overlap``. Reads whose overlap is shorter than
       ``settings.overlap`` are skipped. The merged sequence is extended
       along the genome from each side, and when the two extensions cover
       the merged sequence exactly, a row is written to
       ``settings.printto`` and the flanking sequences are counted.
    6. Write the nucleotide counts (A, C, G, T rows) before and after the
       fusion point to standard output.

    Every file opened here is closed deterministically, including when an
    exception is raised midway.

    :param argv: argument list forwarded to ``process_command_line``.
    :return: 0 on success.
    """
    settings, args = process_command_line(argv)

    # Read the read names and positions
    read_5ps = {}
    read_3ps = {}
    read_genes = {}
    genome = SeqIO.read(settings.genome, "fasta").seq
    with open(settings.list_reads) as reads_fh:
        for line in csv.reader(reads_fh, delimiter="\t"):
            if line[7] == "single":
                continue
            name = line[6]
            read_5ps[name] = [int(line[1]) - 1, line[2]]
            read_3ps[name] = [int(line[4]) - 1, line[5]]
            # Kept as an empty list so the output row layout stays stable.
            read_genes[name] = []

    # Read the bam file and return the long sequences
    bam = pysam.Samfile(settings.bamfile)
    try:
        r1_seqs, r2_seqs = get_reads_seqs(bam, read_genes.keys())
    finally:
        bam.close()

    # position -> nucleotide -> count
    pssm_before = defaultdict(lambda: defaultdict(int))
    pssm_after = defaultdict(lambda: defaultdict(int))

    with open(settings.printto, "w") as out_fh:
        outer = csv.writer(out_fh, delimiter="\t")
        # For each read find the overlap, if exists and find the fusion point
        for rname in set(r1_seqs.keys()) & set(r2_seqs.keys()):
            s1, overlap, s2 = find_overlap(r2_seqs[rname], r1_seqs[rname])
            if len(overlap) < settings.overlap:
                continue
            merged = s1 + overlap + s2
            pos_5p, strand_5p = read_5ps[rname]
            pos_3p, strand_3p = read_3ps[rname]
            side_5p_len = extend_alignment(
                merged, pos_5p, 0, False, strand_5p, genome)
            side_3p_len = extend_alignment(
                merged, 0, pos_3p, True, strand_3p, genome)
            # Only an exact cover of the merged read by the two extensions
            # is reported as a fusion point.
            if side_5p_len + side_3p_len != len(s1) + len(overlap) + len(s2):
                continue

            # 5' side: on '-' the extension runs towards lower coordinates
            # and the genomic slice is reverse-complemented.
            if strand_5p == "-":
                f1_point = pos_5p - side_5p_len + 1
                f1_seq = genome[f1_point:pos_5p + 1].reverse_complement()
            else:
                f1_point = pos_5p + side_5p_len - 1
                f1_seq = genome[pos_5p:f1_point + 1]

            # 3' side: mirror image of the 5' side.
            if strand_3p == "-":
                f2_point = pos_3p + side_3p_len - 1
                f2_seq = genome[pos_3p:f2_point + 1].reverse_complement()
            else:
                f2_point = pos_3p - side_3p_len + 1
                f2_seq = genome[f2_point:pos_3p + 1]

            acc1 = computeACC((s1 + overlap)[:side_5p_len], winlen=1)
            acc2 = computeACC((overlap + s2)[-side_3p_len:], winlen=1)
            outer.writerow(
                [rname]
                + read_genes[rname]
                + read_5ps[rname]
                + read_3ps[rname]
                + [f1_point, f1_seq, f2_point, f2_seq]
                + acc1[-settings.width:]
                + acc2[:settings.width]
            )

            # Add the sequences to the pssm. "before" is keyed by distance
            # from the fusion point (1 = the base adjacent to it), "after"
            # by 0-based offset past it.
            for i, nt in enumerate(f1_seq):
                pssm_before[len(f1_seq) - i][nt] += 1
            for i, nt in enumerate(f2_seq):
                pssm_after[i][nt] += 1
            # Single-argument print(...) gives the same output on Python 2
            # and Python 3.
            print("%sNN%s" % (f1_seq, f2_seq))

    for nt in ("A", "C", "G", "T"):
        for i in range(settings.width + 1, 0, -1):
            sys.stdout.write("%s\t" % pssm_before[i][nt])
        sys.stdout.write("\n")
    for nt in ("A", "C", "G", "T"):
        for i in range(settings.width + 1):
            sys.stdout.write("%s\t" % pssm_after[i][nt])
        sys.stdout.write("\n")
    return 0  # success
def _score_fragment_ends(fragments, perms, num_bases, first_row_id):
    """Score every fragment with each candidate ending; print one row each.

    The last ``num_bases`` characters of each fragment are dropped and
    replaced, in turn, by every entry of ``perms``. Each variant is scored
    with ``computeACC(..., winlen=1)`` and printed as a tab-separated row:
    row id, variant sequence, then the scores.

    :param fragments: iterable of fragment sequences (strings).
    :param perms: candidate endings appended to the truncated fragment.
    :param num_bases: number of trailing characters to replace.
    :param first_row_id: id given to the first printed row.
    :return: ``(scores, next_row_id)`` where ``scores`` holds one
        ``computeACC`` result per printed row, in print order.
    """
    scores = []
    row_id = first_row_id
    for frag in fragments:
        base_seq = frag[:-num_bases]
        for extension in perms:
            variant = base_seq + extension
            acc = computeACC(variant, winlen=1)
            scores.append(acc)
            print("\t".join([str(row_id), variant] + [str(val) for val in acc]))
            row_id += 1
    return scores, row_id


def _print_position_stats(scores, offsets):
    """Print mean/median/max/min of the scores found at each given offset.

    :param scores: list of per-variant score sequences.
    :param offsets: indices (may be negative) into each score sequence.
    :raises ValueError: from ``max``/``min`` when ``scores`` is empty.
    """
    for offset in offsets:
        column = [score_list[offset] for score_list in scores]
        print("position:")
        print("-" * 50)
        print("mean: %s" % (np.mean(column),))
        print("median: %s" % (np.median(column),))
        print("max: %s" % (max(column),))
        print("min: %s" % (min(column),))
        print("\n")


def run(argv=None):
    """Score all candidate fragment endings around the ligation point.

    Reads the tab-separated ``settings.reads_file``. In each row, column 6
    is taken as the fragment before the ligation point ("negative") and
    column 8 as the fragment after it ("positive"). Rows where the combined
    length of both fragments differs from the number of columns from index
    9 onwards are reported on stdout and skipped.

    For both groups the last four bases of every fragment are replaced by
    each entry returned by ``generate_permutation(4)``, every variant is
    scored and printed, and summary statistics are printed: over the last
    four score positions for the negatives and over the first four for the
    positives.

    Fixes compared with the previous implementation:

    * the positive statistics are computed from the positive scores only
      (the negative scores used to leak in through a shared list);
    * row ids keep counting across both groups (the counter used to be
      overwritten by the statistics loop index);
    * an unused per-row computation was removed.

    :param argv: argument list forwarded to ``process_command_line``.
    :return: 0 on success, 1 if command-line processing raised anything.
    """
    try:
        settings, args = process_command_line(argv)
    except BaseException:
        # Deliberately broad, as before: any failure here, SystemExit
        # included, is turned into exit status 1.
        return 1

    FRAG_1_SEQ_INDEX = 6
    FRAG_2_SEQ_INDEX = 8
    FRAG_1_SCORE_START = 9
    NUM_OF_BASES_TO_EXCHANGE = 4

    positive_pos_score_list = []
    negative_pos_score_list = []
    # Get the reads before and after ligation point
    with open(settings.reads_file, "rb") as fl:
        for line in fl:
            row = line.replace("\n", "").split("\t")
            frag_1 = row[FRAG_1_SEQ_INDEX]
            frag_2 = row[FRAG_2_SEQ_INDEX]
            # One score column is expected per base of the two fragments.
            if len(frag_1) + len(frag_2) != len(row[FRAG_1_SCORE_START:]):
                print("-" * 100)
                print("skipped row")
                print(frag_1)
                print(frag_2)
                print("-" * 100)
                continue
            negative_pos_score_list.append(frag_1)
            positive_pos_score_list.append(frag_2)

    perms = generate_permutation(NUM_OF_BASES_TO_EXCHANGE)

    # Calculate the score for each possible end for the fragments before
    # the ligation point
    negative_scores, next_row_id = _score_fragment_ends(
        negative_pos_score_list, perms, NUM_OF_BASES_TO_EXCHANGE, 1)
    # calculate statistics for last positions
    _print_position_stats(
        negative_scores, range(-NUM_OF_BASES_TO_EXCHANGE, 0))

    print("*" * 100)
    print("*" * 100)
    print("finished negatives, starting positives")
    print("*" * 100)
    print("*" * 100)

    # Calculate the score for each possible end for the fragments after
    # the ligation point.
    # NOTE(review): as before, the candidate bases replace the END of these
    # fragments while the statistics below look at their FIRST positions;
    # confirm that this is intended.
    positive_scores, next_row_id = _score_fragment_ends(
        positive_pos_score_list, perms, NUM_OF_BASES_TO_EXCHANGE,
        next_row_id)
    # calculate statistics for first positions
    _print_position_stats(positive_scores, range(NUM_OF_BASES_TO_EXCHANGE))
    return 0
def main(argv=None):
    """Report fusion points of overlapping read pairs and print a PSSM.

    Steps, in order:

    1. Parse the command line with ``process_command_line``.
    2. Load the genome sequence (first FASTA record of ``settings.genome``).
    3. Read the tab-delimited ``settings.list_reads`` file. Rows whose
       column 7 equals ``'single'`` are skipped. For the remaining rows,
       column 6 is used as the read name, columns 1/2 as the 5' position
       and strand, and columns 4/5 as the 3' position and strand. Both
       positions are decremented by one (presumably 1-based input turned
       into 0-based indices; confirm against the file producer).
    4. Fetch both mates' sequences from ``settings.bamfile``.
    5. For each read present in both mate collections, merge the mates with
       ``find_overlap``. Reads whose overlap is shorter than
       ``settings.overlap`` are skipped. The merged sequence is extended
       along the genome from each side, and when the two extensions cover
       the merged sequence exactly, a row is written to
       ``settings.printto`` and the flanking sequences are counted.
    6. Write the nucleotide counts (A, C, G, T rows) before and after the
       fusion point to standard output.

    Every file opened here is closed deterministically, including when an
    exception is raised midway.

    :param argv: argument list forwarded to ``process_command_line``.
    :return: 0 on success.
    """
    settings, args = process_command_line(argv)

    # Read the read names and positions
    read_5ps = {}
    read_3ps = {}
    read_genes = {}
    genome = SeqIO.read(settings.genome, 'fasta').seq
    with open(settings.list_reads) as reads_fh:
        for line in csv.reader(reads_fh, delimiter='\t'):
            if line[7] == 'single':
                continue
            name = line[6]
            read_5ps[name] = [int(line[1]) - 1, line[2]]
            read_3ps[name] = [int(line[4]) - 1, line[5]]
            # Kept as an empty list so the output row layout stays stable.
            read_genes[name] = []

    # Read the bam file and return the long sequences
    bam = pysam.Samfile(settings.bamfile)
    try:
        r1_seqs, r2_seqs = get_reads_seqs(bam, read_genes.keys())
    finally:
        bam.close()

    # position -> nucleotide -> count
    pssm_before = defaultdict(lambda: defaultdict(int))
    pssm_after = defaultdict(lambda: defaultdict(int))

    with open(settings.printto, 'w') as out_fh:
        outer = csv.writer(out_fh, delimiter='\t')
        # For each read find the overlap, if exists and find the fusion point
        for rname in set(r1_seqs.keys()) & set(r2_seqs.keys()):
            s1, overlap, s2 = find_overlap(r2_seqs[rname], r1_seqs[rname])
            if len(overlap) < settings.overlap:
                continue
            merged = s1 + overlap + s2
            pos_5p, strand_5p = read_5ps[rname]
            pos_3p, strand_3p = read_3ps[rname]
            side_5p_len = extend_alignment(
                merged, pos_5p, 0, False, strand_5p, genome)
            side_3p_len = extend_alignment(
                merged, 0, pos_3p, True, strand_3p, genome)
            # Only an exact cover of the merged read by the two extensions
            # is reported as a fusion point.
            if side_5p_len + side_3p_len != len(s1) + len(overlap) + len(s2):
                continue

            # 5' side: on '-' the extension runs towards lower coordinates
            # and the genomic slice is reverse-complemented.
            if strand_5p == '-':
                f1_point = pos_5p - side_5p_len + 1
                f1_seq = genome[f1_point:pos_5p + 1].reverse_complement()
            else:
                f1_point = pos_5p + side_5p_len - 1
                f1_seq = genome[pos_5p:f1_point + 1]

            # 3' side: mirror image of the 5' side.
            if strand_3p == '-':
                f2_point = pos_3p + side_3p_len - 1
                f2_seq = genome[pos_3p:f2_point + 1].reverse_complement()
            else:
                f2_point = pos_3p - side_3p_len + 1
                f2_seq = genome[f2_point:pos_3p + 1]

            acc1 = computeACC((s1 + overlap)[:side_5p_len], winlen=1)
            acc2 = computeACC((overlap + s2)[-side_3p_len:], winlen=1)
            outer.writerow([rname] + read_genes[rname] + read_5ps[rname] +
                           read_3ps[rname] +
                           [f1_point, f1_seq, f2_point, f2_seq] +
                           acc1[-settings.width:] + acc2[:settings.width])

            # Add the sequences to the pssm. "before" is keyed by distance
            # from the fusion point (1 = the base adjacent to it), "after"
            # by 0-based offset past it.
            for i, nt in enumerate(f1_seq):
                pssm_before[len(f1_seq) - i][nt] += 1
            for i, nt in enumerate(f2_seq):
                pssm_after[i][nt] += 1
            # Single-argument print(...) gives the same output on Python 2
            # and Python 3.
            print("%sNN%s" % (f1_seq, f2_seq))

    for nt in ('A', 'C', 'G', 'T'):
        for i in range(settings.width + 1, 0, -1):
            sys.stdout.write("%s\t" % pssm_before[i][nt])
        sys.stdout.write("\n")
    for nt in ('A', 'C', 'G', 'T'):
        for i in range(settings.width + 1):
            sys.stdout.write("%s\t" % pssm_after[i][nt])
        sys.stdout.write("\n")
    return 0  # success
def _score_fragment_ends(fragments, perms, num_bases, first_row_id):
    """Score every fragment with each candidate ending; print one row each.

    The last ``num_bases`` characters of each fragment are dropped and
    replaced, in turn, by every entry of ``perms``. Each variant is scored
    with ``computeACC(..., winlen=1)`` and printed as a tab-separated row:
    row id, variant sequence, then the scores.

    :param fragments: iterable of fragment sequences (strings).
    :param perms: candidate endings appended to the truncated fragment.
    :param num_bases: number of trailing characters to replace.
    :param first_row_id: id given to the first printed row.
    :return: ``(scores, next_row_id)`` where ``scores`` holds one
        ``computeACC`` result per printed row, in print order.
    """
    scores = []
    row_id = first_row_id
    for frag in fragments:
        base_seq = frag[:-num_bases]
        for extension in perms:
            variant = base_seq + extension
            acc = computeACC(variant, winlen=1)
            scores.append(acc)
            print("\t".join([str(row_id), variant] + [str(val) for val in acc]))
            row_id += 1
    return scores, row_id


def _print_position_stats(scores, offsets):
    """Print mean/median/max/min of the scores found at each given offset.

    :param scores: list of per-variant score sequences.
    :param offsets: indices (may be negative) into each score sequence.
    :raises ValueError: from ``max``/``min`` when ``scores`` is empty.
    """
    for offset in offsets:
        column = [score_list[offset] for score_list in scores]
        print("position:")
        print("-" * 50)
        print("mean: %s" % (np.mean(column),))
        print("median: %s" % (np.median(column),))
        print("max: %s" % (max(column),))
        print("min: %s" % (min(column),))
        print("\n")


def run(argv=None):
    """Score all candidate fragment endings around the ligation point.

    Reads the tab-separated ``settings.reads_file``. In each row, column 6
    is taken as the fragment before the ligation point ("negative") and
    column 8 as the fragment after it ("positive"). Rows where the combined
    length of both fragments differs from the number of columns from index
    9 onwards are reported on stdout and skipped.

    For both groups the last four bases of every fragment are replaced by
    each entry returned by ``generate_permutation(4)``, every variant is
    scored and printed, and summary statistics are printed: over the last
    four score positions for the negatives and over the first four for the
    positives.

    Fixes compared with the previous implementation:

    * the positive statistics are computed from the positive scores only
      (the negative scores used to leak in through a shared list);
    * row ids keep counting across both groups (the counter used to be
      overwritten by the statistics loop index);
    * an unused per-row computation was removed.

    :param argv: argument list forwarded to ``process_command_line``.
    :return: 0 on success, 1 if command-line processing raised anything.
    """
    try:
        settings, args = process_command_line(argv)
    except BaseException:
        # Deliberately broad, as before: any failure here, SystemExit
        # included, is turned into exit status 1.
        return 1

    FRAG_1_SEQ_INDEX = 6
    FRAG_2_SEQ_INDEX = 8
    FRAG_1_SCORE_START = 9
    NUM_OF_BASES_TO_EXCHANGE = 4

    positive_pos_score_list = []
    negative_pos_score_list = []
    # Get the reads before and after ligation point
    with open(settings.reads_file, "rb") as fl:
        for line in fl:
            row = line.replace("\n", "").split("\t")
            frag_1 = row[FRAG_1_SEQ_INDEX]
            frag_2 = row[FRAG_2_SEQ_INDEX]
            # One score column is expected per base of the two fragments.
            if len(frag_1) + len(frag_2) != len(row[FRAG_1_SCORE_START:]):
                print("-" * 100)
                print("skipped row")
                print(frag_1)
                print(frag_2)
                print("-" * 100)
                continue
            negative_pos_score_list.append(frag_1)
            positive_pos_score_list.append(frag_2)

    perms = generate_permutation(NUM_OF_BASES_TO_EXCHANGE)

    # Calculate the score for each possible end for the fragments before
    # the ligation point
    negative_scores, next_row_id = _score_fragment_ends(
        negative_pos_score_list, perms, NUM_OF_BASES_TO_EXCHANGE, 1)
    # calculate statistics for last positions
    _print_position_stats(
        negative_scores, range(-NUM_OF_BASES_TO_EXCHANGE, 0))

    print("*" * 100)
    print("*" * 100)
    print("finished negatives, starting positives")
    print("*" * 100)
    print("*" * 100)

    # Calculate the score for each possible end for the fragments after
    # the ligation point.
    # NOTE(review): as before, the candidate bases replace the END of these
    # fragments while the statistics below look at their FIRST positions;
    # confirm that this is intended.
    positive_scores, next_row_id = _score_fragment_ends(
        positive_pos_score_list, perms, NUM_OF_BASES_TO_EXCHANGE,
        next_row_id)
    # calculate statistics for first positions
    _print_position_stats(positive_scores, range(NUM_OF_BASES_TO_EXCHANGE))
    return 0