Example #1
0
def generate_fastq_depending_on_alignment_results(
        inSam, outputs, check_multimapFunction=default_check_multimap):
    """Write reads from a Sam file to Fastq handlers chosen by alignment class.

    Every (read, category) pair yielded by
    sam_file_analyzer_generator(inSam, check_multimapFunction) is counted,
    and the read is written as a four-line Fastq record to the handler
    registered for its category. Per-category counts are logged at the end.

    inSam - reference Sam filename
    outputs - dict {alignment_class : handler}
        where alignment_classes are
            1 = "unique"
            2 = "repeats"
            0 = "unmapped"
        and handler is either a file handler or None to throw away reads.
        Classes that are missing from the dict are thrown away as well.
    check_multimapFunction - see analyze_sam_file
    """
    startLog(None)
    stats = {0: 0, 1: 0, 2: 0}
    for read, category in sam_file_analyzer_generator(inSam,
                                                      check_multimapFunction):
        # Reads are counted even when they are not written anywhere.
        stats[category] += 1
        handler = outputs.get(category)
        if handler is None:
            # Either no entry for this class or an explicit None handler:
            # both mean "discard the read".
            continue
        # Qualities are numeric; Fastq stores them as characters offset by 33.
        qualities = "".join(chr(q + 33) for q in read.query_qualities)
        handler.write("@" + read.query_name + "\n" +
                      read.query_sequence + "\n+\n" +
                      qualities + "\n")
    logging.info(
        "Sam file " + inSam + " filtered:\n" +
        "\n".join([str(i) + ":\t" + str(j) for i, j in stats.items()]))
Example #2
0
 def __init__(self, R1sam, R2sam, replaceFiles=True, logFileName=None):
     #Keeps the constructor arguments on the instance, starts logging
     #and re-seeds the global random generator.
     #
     #R1sam, R2sam - sam files with left and right reads
     #replaceFiles - ensure that output files can be rewritten if exist
     #logFileName - forwarded to startLog as its filename argument
     self.R1sam, self.R2sam, self.replaceFiles = R1sam, R2sam, replaceFiles
     startLog(filename=logFileName)
     logging.info("Starting repeat localizer")
     random.seed()
Example #3
0
def splif_fastq_depending_on_alignment_results(
        inFastq,
        inSam,
        outputs,
        check_multimapFunction=default_check_multimap):
    """Split a Fastq file into per-class outputs based on alignment results.

    Each four-line record of inFastq is copied unchanged to the handler
    registered for the alignment class of its read (looked up by the first
    whitespace-separated token of the header line, without the leading
    character, in the result of analyze_sam_file(inSam, ...)).
    Per-class counts of written records are logged at the end.

    inFastq - input Fastq filename
    inSam - reference Sam filename
    outputs - dict {alignment_class : handler}
        where alignment_classes are
            1 = "unique"
            2 = "repeats"
            0 = "unmapped"
        and handler is either a file handler or None to throw away reads.
        Missing classes are treated like None; the dict itself is not
        modified.
    check_multimapFunction - see analyze_sam_file

    A read name absent from the alignment results raises KeyError. A
    record with fewer than four lines lets the StopIteration raised by
    next() propagate.
    """
    startLog(None)
    alignment_results = analyze_sam_file(
        inSam, check_multimapFunction=check_multimapFunction)

    # Resolve the handlers once into a local table instead of inserting
    # None placeholders into the caller's dict.
    handlers = {}
    stats = {}
    for alignment_class in (0, 1, 2):
        handlers[alignment_class] = outputs.get(alignment_class)
        if alignment_class in outputs:
            # Statistics are reported only for classes the caller listed.
            stats[alignment_class] = 0

    with open(inFastq, "r") as fin:
        for header in fin:
            if not header.strip():
                # Blank line (e.g. trailing newline at end of file) where a
                # header is expected: nothing to classify.
                continue
            read_name = header[1:].strip().split()[0]
            alignment_class = alignment_results[read_name]
            handler = handlers[alignment_class]
            if handler is not None:
                stats[alignment_class] += 1
                handler.write(header)
            # The remaining three lines of the record are always consumed,
            # and copied only when the read is kept. next(fin) works on
            # both Python 2 and Python 3 file objects.
            for _ in range(3):
                record_line = next(fin)
                if handler is not None:
                    handler.write(record_line)

    logging.info(
        "Fastq " + inFastq + " filtered:\n" +
        "\n".join([str(i) + ":\t" + str(j) for i, j in stats.items()]))
Example #4
0
    def __init__(self,
                 inFastq,
                 outSam,
                 enzymeName,
                 replaceFiles=False,
                 aligner_function=None,
                 aligner_parameters=None,
                 logFileName="HiCmapper.log"):
        """Set up the mapper: logging, aligner hooks and enzyme sequences.

        inFastq - input fastq file with reads file name
        outSam - file name of the sam file for alignment
        enzymeName - name of RE enzyme used for Hi-C data; must be a key of
            Bio.Restriction.Restriction_Dictionary.rest_dict
        replaceFiles - replace output and temp files
        aligner_function - function which calls aligning software. Function
            must accept args inFastq and outSam. Optional
        aligner_parameters - additional parameters to pass to alignment
            software, example: "--very-sencitive -x path-to-file -p 8"
        logFileName - file name for logs

        Raises ValueError if enzymeName is not a known restriction enzyme.
        """
        startLog(logFileName, loglevel=logging.DEBUG)
        self.statistics = {}  #TODO introduce special statistics class
        if aligner_function is not None:
            # NOTE(review): this used to be the no-op comparison
            # `self.perform_alignmen == aligner_function`. The attribute is
            # presumably `perform_alignment` - confirm against the class.
            self.perform_alignment = aligner_function

        self.aligner_parameters = aligner_parameters

        # Explicit check instead of assert (asserts are stripped under -O),
        # and attribute lookup instead of eval on a caller-supplied string.
        if enzymeName not in Bio.Restriction.Restriction_Dictionary.rest_dict:
            raise ValueError(
                "Unknown restriction enzyme: " + repr(enzymeName))
        self.enzyme = getattr(Bio.Restriction, enzymeName)
        self.enzymeSeq = self.enzyme.site.upper()

        #example: EcoRI.elucidate() returns 'G^AATT_C'
        cut_pattern = self.enzyme.elucidate()
        st = cut_pattern.find("^")
        end = cut_pattern.find("_")
        # `end - 1` compensates for the "^" marker counted in `end`, so the
        # slicing assumes "^" comes before "_" in the elucidate() string.
        # NOTE(review): confirm behaviour for enzymes where "_" comes first.
        self.enzymeFilledInSeq = (self.enzymeSeq[:end - 1] +
                                  self.enzymeSeq[st:]).upper()
        logging.debug("Uing filled-in enzyme seq: " + self.enzymeFilledInSeq)

        self.inFastq = inFastq
        self.outSam = outSam
        self.replace_files = replaceFiles

        #hic_check_multimap is a function to check whether read has multiple alignments
        #this function should be defined separately for each aligner
        if not hasattr(self, 'hic_check_multimap'):
            logging.warning(
                "Using default function to check multiple alignments")
            self.hic_check_multimap = default_check_multimap
import numpy as np
import sys
sys.path.append("/mnt/storage/home/vsfishman/tmp/HiC_repeats/scripts")
from miscellaneous import startPmDebug,startLog
from NGSseq_utils import *
from Hi_C_mapper import *
startPmDebug()

# Directory holding the alignment results and the coordinates file.
basedir = "/mnt/storage/home/vsfishman/tmp/HiC_repeats/out/Battulin2015/"

# Sam files with the alignments of the two read mates.
uniqueSam_R1 = basedir+"2500k_bowtie2_R1.sam"
uniqueSam_R2 = basedir+"2500k_bowtie2_R2.sam"
# NOTE: the first assignment is overridden right away by the second one;
# only the "R2Untrimmed.paired.coords" file is actually read below.
guesResults = basedir+"2500k.unique.fq.trim30.fq.realigned.coords"
guesResults = basedir + "2500k"+".unique.fq"+".trim30.fq"+"R2Untrimmed.paired.coords"

startLog(filename=None)
mapper = Cbowtie2_Hi_C_mapper("","","HindIII",logFileName=None)

# First column of each line -> list of the remaining columns parsed as int.
reads_pairs = {}

# Context manager so the file is closed as soon as it has been read.
with open(guesResults) as coords_file:
	for line in coords_file:
		line = line.strip().split()
		# list(...) keeps the values indexable on Python 3, where map is lazy.
		reads_pairs[line[0]] = list(map(int,line[1:]))

dist1 = []

r1notFound = 0
r2notFound = 0
	
for r1,status in sam_file_analyzer_generator(uniqueSam_R1,check_multimapFunction=mapper.hic_check_multimap):
	if status == 1: