def generate_fastq_depending_on_alignment_results(
        inSam, outputs, check_multimapFunction=default_check_multimap):
    """Write the reads of a SAM file to FASTQ handlers chosen by alignment class.

    Every (read, category) pair yielded by ``sam_file_analyzer_generator`` is
    counted; the read is then written as a four-line FASTQ record to the
    handler registered for its category, if there is one.

    Args:
        inSam: reference SAM filename.
        outputs: dict ``{alignment_class: handler}`` where the alignment
            classes are 1 = "unique", 2 = "repeats", 0 = "unmapped" and the
            handler is either a writable file handler or ``None`` to throw
            the reads of that class away. Classes missing from the dict are
            thrown away as well.
        check_multimapFunction: forwarded to ``sam_file_analyzer_generator``;
            see ``analyze_sam_file`` for this parameter.
    """
    startLog(None)
    stats = dict((i, 0) for i in (0, 1, 2))
    for read, category in sam_file_analyzer_generator(inSam,
                                                      check_multimapFunction):
        stats[category] += 1
        # A None handler means "discard"; the previous code only tested key
        # presence and then crashed calling .write on None.
        handler = outputs.get(category)
        if handler is None:
            continue
        # Qualities are encoded with an offset of 33, one character per value.
        qualities = "".join(chr(q + 33) for q in read.query_qualities)
        handler.write("@" + read.query_name + "\n" +
                      read.query_sequence + "\n+\n" +
                      qualities + "\n")
    logging.info(
        "Sam file " + inSam + " filtered:\n" +
        "\n".join(str(i) + ":\t" + str(j) for i, j in stats.items()))
def __init__(self, R1sam, R2sam, replaceFiles=True, logFileName=None):
    """Store the input SAM files and options, then start logging.

    Args:
        R1sam: sam file with the left reads.
        R2sam: sam file with the right reads.
        replaceFiles: ensure that output files can be rewritten if they
            already exist.
        logFileName: passed to ``startLog`` as ``filename``.
    """
    self.R1sam, self.R2sam = R1sam, R2sam
    self.replaceFiles = replaceFiles

    startLog(filename=logFileName)
    logging.info("Starting repeat localizer")

    # Called without arguments, so the seed is not fixed by this code.
    random.seed()
def splif_fastq_depending_on_alignment_results(
        inFastq, inSam, outputs,
        check_multimapFunction=default_check_multimap):
    """Split a FASTQ file between handlers according to alignment class.

    The alignment class of every read is looked up (by read name) in the
    result of ``analyze_sam_file(inSam, ...)``; the read's four FASTQ lines
    are copied verbatim to the handler registered for that class.

    Args:
        inFastq: input Fastq filename.
        inSam: reference Sam filename.
        outputs: dict ``{alignment_class: handler}`` where the alignment
            classes are 1 = "unique", 2 = "repeats", 0 = "unmapped" and the
            handler is either a file handler or ``None`` to throw the reads
            of that class away. Missing classes are treated as ``None``.
            The dict is no longer modified by this function.
        check_multimapFunction: forwarded to ``analyze_sam_file``; see that
            function for this parameter.

    Raises:
        KeyError: if a read name from ``inFastq`` is absent from the
            alignment results, or its class is not one of 0, 1, 2.
    """
    startLog(None)
    alignment_results = analyze_sam_file(
        inSam, check_multimapFunction=check_multimapFunction)

    classes = (0, 1, 2)
    # Work on a private view instead of inserting None entries into the
    # caller's dict.
    handlers = dict((cls, outputs.get(cls)) for cls in classes)
    # Only classes the caller listed are reported in the statistics.
    stats = dict((cls, 0) for cls in classes if cls in outputs)

    with open(inFastq, "r") as fin:
        for header in fin:
            # Read name = first whitespace-separated token after the
            # leading character of the header line.
            read_name = header[1:].strip().split()[0]
            alignment_class = alignment_results[read_name]
            handler = handlers[alignment_class]
            if handler is None:
                # Skip the remaining three lines of this record.
                for _ in range(3):
                    next(fin)
                continue
            stats[alignment_class] += 1
            handler.write(header)
            for _ in range(3):
                handler.write(next(fin))

    logging.info(
        "Fastq " + inFastq + " filtered:\n" +
        "\n".join(str(i) + ":\t" + str(j) for i, j in stats.items()))
def __init__(self, inFastq, outSam, enzymeName, replaceFiles=False,
             aligner_function=None, aligner_parameters=None,
             logFileName="HiCmapper.log"):
    """Set up logging, the aligner hooks and the restriction-enzyme data.

    Args:
        inFastq: name of the input fastq file with reads.
        outSam: name of the sam file for the alignment.
        enzymeName: name of the restriction enzyme used for the Hi-C data;
            must be a key of
            ``Bio.Restriction.Restriction_Dictionary.rest_dict``.
        replaceFiles: replace output and temp files.
        aligner_function: optional function which calls the aligning
            software; it must accept the arguments inFastq and outSam.
        aligner_parameters: additional parameters to pass to the alignment
            software, e.g. "--very-sensitive -x path-to-file -p 8".
        logFileName: file name for logs.

    Raises:
        AssertionError: if ``enzymeName`` is not a known enzyme.
    """
    startLog(logFileName, loglevel=logging.DEBUG)
    self.statistics = {}  # TODO introduce special statistics class

    if aligner_function is not None:
        # Was `self.perform_alignmen == aligner_function`: a comparison whose
        # result was discarded, so the custom aligner was never installed.
        # NOTE(review): attribute name assumed to be `perform_alignment` -
        # confirm against the aligner method defined on the class.
        self.perform_alignment = aligner_function
    # NOTE(review): the flattened source does not show whether this
    # assignment was inside the `if` above; it is made unconditionally here.
    self.aligner_parameters = aligner_parameters

    assert enzymeName in Bio.Restriction.Restriction_Dictionary.rest_dict, (
        "Unknown restriction enzyme: " + str(enzymeName))
    # Plain attribute lookup instead of eval(): same object, no code
    # execution on the enzyme name.
    self.enzyme = getattr(Bio.Restriction, enzymeName)
    self.enzymeSeq = self.enzyme.site.upper()

    # example: EcoRI.elucidate() returns 'G^AATT_C'
    cut_pattern = self.enzyme.elucidate()
    st = cut_pattern.find("^")
    end = cut_pattern.find("_")
    # `end - 1` compensates for the '^' marker that precedes '_' in the
    # pattern - assumes '^' comes before '_', as in the EcoRI example.
    self.enzymeFilledInSeq = (self.enzymeSeq[:end - 1] +
                              self.enzymeSeq[st:]).upper()
    logging.debug("Uing filled-in enzyme seq: " + self.enzymeFilledInSeq)

    self.inFastq = inFastq
    self.outSam = outSam
    self.replace_files = replaceFiles

    # hic_check_multimap is a function to check whether a read has multiple
    # alignments; it should be defined separately for each aligner.
    if not hasattr(self, 'hic_check_multimap'):
        logging.warning(
            "Using default function to check multiple alignments")
        self.hic_check_multimap = default_check_multimap
import numpy as np import sys sys.path.append("/mnt/storage/home/vsfishman/tmp/HiC_repeats/scripts") from miscellaneous import startPmDebug,startLog from NGSseq_utils import * from Hi_C_mapper import * startPmDebug() basedir = "/mnt/storage/home/vsfishman/tmp/HiC_repeats/out/Battulin2015/" uniqueSam_R1 = basedir+"2500k_bowtie2_R1.sam" uniqueSam_R2 = basedir+"2500k_bowtie2_R2.sam" guesResults = basedir+"2500k.unique.fq.trim30.fq.realigned.coords" guesResults = basedir + "2500k"+".unique.fq"+".trim30.fq"+"R2Untrimmed.paired.coords" startLog(filename=None) mapper = Cbowtie2_Hi_C_mapper("","","HindIII",logFileName=None) reads_pairs = {} for line in open(guesResults): line = line.strip().split() reads_pairs[line[0]] = map(int,line[1:]) dist1 = [] r1notFound = 0 r2notFound = 0 for r1,status in sam_file_analyzer_generator(uniqueSam_R1,check_multimapFunction=mapper.hic_check_multimap): if status == 1: