def create_sgr(output_dir, eland_file_path, chr):
    """Write the signal track ``<chr>.sgr`` for the hits in an ELAND file.

    Every hit is turned into an interval whose height contribution is +1 at
    its start and -1 at its stop:

    * forward ('F') hits run from ``coordinate`` to ``coordinate + WINDOW``;
    * reverse ('R') hits stop at ``coordinate + len(sequence)`` and start
      ``WINDOW`` positions earlier, clamped so the start is never below 1.

    The +1/-1 deltas are accumulated per coordinate, then swept in ascending
    coordinate order; one ``SGR(chr, coord, height)`` record is written for
    each coordinate at which a delta was recorded.

    Args:
        output_dir: directory in which ``<chr>.sgr`` is created.
        eland_file_path: path handed to ``ElandFile`` for reading.
        chr: chromosome name; used for the output file name and passed to
            every ``SGR`` record.

    Hits whose strand is neither 'F' nor 'R' are skipped.
    """
    eland = ElandFile(eland_file_path, 'r')
    try:
        output = open(os.path.join(output_dir, chr + ".sgr"), 'w')
        try:
            # coordinate -> net change in height at that coordinate
            signals = {}
            for hit in eland:
                if hit.strand == 'F':
                    start = int(hit.coordinate)
                    stop = start + WINDOW
                elif hit.strand == 'R':
                    stop = int(hit.coordinate) + len(hit.sequence)
                    start = max(stop - WINDOW, 1)
                else:
                    # Without this branch an unexpected strand would silently
                    # re-count the previous hit's interval (or hit a NameError
                    # if it were the very first hit).
                    continue
                signals[start] = signals.get(start, 0) + 1
                signals[stop] = signals.get(stop, 0) - 1

            # Running sum of the deltas gives the height at each coordinate.
            height = 0
            for coord in sorted(signals):
                height += signals[coord]
                output.write(str(SGR(chr, coord, height)) + "\n")
        finally:
            output.close()
    finally:
        eland.close()
def merge_unique_eland(output, mapped_reads_files, mismatches=2):
    """Merge uniquely mapped reads from several inputs into one ELAND file.

    Each input path is dispatched on its name:

    * ``*.bam`` / ``*.sam`` are handed to ``convert_bam`` / ``convert_sam``
      together with the open output file and ``mismatches``;
    * a path containing ``'multi'`` is read as ``ElandMultiFile``;
    * otherwise a path containing ``'extended'`` is read as
      ``ElandExtendedFile``;
    * anything else is read as ``ElandFile``.

    For the ELAND-style inputs a line is copied to the output (via
    ``convert_to_eland()``) only when it has exactly one best match and that
    match has at most ``mismatches`` mismatches. A one-line summary is
    printed per ELAND-style input.

    Args:
        output: path of the merged ELAND file to write.
        mapped_reads_files: iterable of input file paths.
        mismatches: maximum number of mismatches allowed in the best hit.

    Raises:
        Exception: if an input path does not exist.

    NOTE(review): the 'multi' / 'extended' checks match anywhere in the path
    string, directory names included.
    NOTE(review): a second ``merge_unique_eland`` appears later in this file;
    if both are module-level, the later one is what callers get.
    """
    eland_out = ElandFile(output, 'w')
    try:
        for path in mapped_reads_files:
            if not os.path.exists(path):
                raise Exception("File %s does not exist" % path)
            if path.endswith('.bam'):
                convert_bam(eland_out, path, mismatches)
                continue
            if path.endswith('.sam'):
                convert_sam(eland_out, path, mismatches)
                continue

            if 'multi' in path:
                eland_in = ElandMultiFile(path, 'r')
            elif 'extended' in path:
                eland_in = ElandExtendedFile(path, 'r')
            else:
                eland_in = ElandFile(path, 'r')

            try:
                # Count lines explicitly so the summary reports the real
                # number of lines, and 0 for an empty input.
                total_lines = 0
                total_passed = 0
                for line in eland_in:
                    total_lines += 1
                    best_hits = line.best_matches()
                    if len(best_hits) != 1:
                        continue  # Only merge unique hits
                    if best_hits[0].number_of_mismatches() > mismatches:
                        continue
                    total_passed += 1
                    eland_out.write(line.convert_to_eland())
                print("unique eland: total lines %s total passed %s"
                      % (total_lines, total_passed))
            finally:
                eland_in.close()
    finally:
        # Runs on the missing-file error path as well.
        eland_out.close()
def divide_eland_by_chr(eland_file, genome, output_dir=""):
    """Split an ELAND file into one ELAND file per chromosome.

    Lines are routed by ``line.chr_name``. Output files are obtained from
    ``open_chr_file``, which is given a shared ``chr_files`` dict so that it
    can hand back the same file object for a chromosome it has already seen.

    Args:
        eland_file: path of the ELAND file to read.
        genome: genome identifier passed to ``get_chr_mapping`` and on to
            ``open_chr_file``.
        output_dir: directory passed to ``open_chr_file`` for the outputs.
    """
    chr_files = {}
    chr_map = get_chr_mapping(genome)
    eland_in = ElandFile(eland_file, 'r')
    try:
        for line in eland_in:
            # Lines on a chromosome that the genome mapping does not know
            # are dropped without any message.
            if line.chr_name not in chr_map:
                continue
            chr_out = open_chr_file(line.chr_name, chr_files, genome,
                                    chr_map, output_dir)
            chr_out.write(line)
    finally:
        # Close everything opened so far, even if reading or writing failed.
        try:
            eland_in.close()
        finally:
            for chr_out in chr_files.values():
                chr_out.close()
def open_chr_file(chr_name, chr_files, genome, chr_map, output_dir=""):
    """Return the output ELAND file for ``chr_name``, opening it on first use.

    ``chr_files`` acts as the cache: when it has no entry for ``chr_name`` a
    new ``ElandFile`` named ``<chr_map[chr_name]>_eland.txt`` is opened for
    writing inside ``output_dir`` and stored under ``chr_name``. Later calls
    with the same name get that same object back.

    ``genome`` is accepted but not used by this function.
    """
    if chr_name not in chr_files:
        file_name = '%s_eland.txt' % (chr_map[chr_name])
        chr_files[chr_name] = ElandFile(
            os.path.join(output_dir, file_name), 'w')
    return chr_files[chr_name]
def merge_unique_eland(output, mapped_reads_files, mismatches=2):
    """Merge uniquely mapped reads from several inputs into one ELAND file.

    Each input path is dispatched on its name:

    * ``*.bam`` / ``*.sam`` are handed to ``convert_bam`` / ``convert_sam``
      together with the open output file and ``mismatches``;
    * a path containing ``'multi'`` is read as ``ElandMultiFile``;
    * otherwise a path containing ``'extended'`` is read as
      ``ElandExtendedFile``;
    * anything else is read as ``ElandFile``.

    For the ELAND-style inputs a line is copied to the output (via
    ``convert_to_eland()``) only when it has exactly one best match and that
    match has at most ``mismatches`` mismatches.

    Progress is traced on stdout: the list of inputs and the output path,
    which reader was chosen, the best hits of each merged read, and a
    per-input summary of lines read and lines merged.

    Args:
        output: path of the merged ELAND file to write.
        mapped_reads_files: iterable of input file paths.
        mismatches: maximum number of mismatches allowed in the best hit.

    Raises:
        Exception: if an input path does not exist.

    NOTE(review): the 'multi' / 'extended' checks match anywhere in the path
    string, directory names included.
    """
    print("merge_filter %s to %s" % (','.join(mapped_reads_files), output))
    eland_out = ElandFile(output, 'w')
    try:
        for path in mapped_reads_files:
            if not os.path.exists(path):
                raise Exception("File %s does not exist" % path)
            if path.endswith('.bam'):
                convert_bam(eland_out, path, mismatches)
                continue
            if path.endswith('.sam'):
                convert_sam(eland_out, path, mismatches)
                continue

            if 'multi' in path:
                print(" multi eland ...")
                eland_in = ElandMultiFile(path, 'r')
            elif 'extended' in path:
                print(" extended ...")
                eland_in = ElandExtendedFile(path, 'r')
            else:
                print("ElandFile ...")
                eland_in = ElandFile(path, 'r')

            try:
                # Count lines explicitly so the summary reports the real
                # number of lines, and 0 for an empty input.
                total_lines = 0
                total_passed = 0
                for line in eland_in:
                    total_lines += 1
                    best_hits = line.best_matches()
                    if len(best_hits) != 1:
                        continue  # Only merge unique hits
                    if best_hits[0].number_of_mismatches() > mismatches:
                        continue
                    total_passed += 1
                    eland_out.write(line.convert_to_eland())
                    # Trace output: the hit list of the read just merged.
                    print("best_hits %s" % (best_hits,))
                print("unique eland: total lines %s total passed %s"
                      % (total_lines, total_passed))
            finally:
                eland_in.close()
    finally:
        # Runs on the missing-file error path as well.
        eland_out.close()