Exemple #1
0
    def clean(self):
        """Validate the submitted sequences and the requested mutation count.

        Parses whichever of ``sequenceF`` / ``sequenceS`` is truthy (the
        former takes precedence) with ``mc.FastaFile`` and checks every
        record.

        Raises:
            forms.ValidationError: if a record contains a letter outside the
                keys of ``myf.mapperDict`` plus ``N``, if a record consists
                solely of ``N``, or if ``numToMutate`` is greater than
                ``sequence.getMinLength()``.
        """
        super(Part3Form, self).clean()

        numToMutate = self.cleaned_data.get("numToMutate")

        # sequenceF wins over sequenceS when both are supplied.
        rawSequence = (self.cleaned_data.get("sequenceF")
                       or self.cleaned_data.get("sequenceS"))
        if not rawSequence:
            return

        sequence = mc.FastaFile(rawSequence, fileName=False)
        if not sequence:
            return

        # Built with a set union so it works on Python 2 and 3 alike
        # (``dict.keys() + [...]`` is Python-2 only).
        allowedLetters = set(myf.mapperDict.keys()) | {"N"}

        for record in sequence:
            letters = set(record.sequence)
            if letters - allowedLetters:
                raise forms.ValidationError("Invalid letters!")

            if letters == {"N"}:
                raise forms.ValidationError(
                    "Please do not enter a string only containing N!")

        # ``.get`` yields None when the field is absent from cleaned_data;
        # skip the comparison then instead of comparing None with an int.
        if numToMutate is not None and numToMutate > sequence.getMinLength():
            raise forms.ValidationError(
                "The number of mutations is greater than the length of the smallest sequence"
            )
Exemple #2
0
    def clean(self):
        """Cross-validate the ``sequenceF`` and ``sequenceS`` inputs.

        When both values are present in ``cleaned_data`` (neither is None),
        exactly one of them must be filled in.  The supplied input is then
        parsed with ``mc.FastaFile`` and checked against
        ``self.sequenceLimit`` and ``self.sequenceLengthLimit``.

        Raises:
            forms.ValidationError: if both inputs are empty strings, if both
                are supplied, if there are more than ``self.sequenceLimit``
                records, or if the longest record exceeds
                ``self.sequenceLengthLimit``.
        """
        super(FastaForm, self).clean()
        fastaFile = self.cleaned_data.get("sequenceF")
        fastaSequence = self.cleaned_data.get("sequenceS")

        # A None value skips the either/or check entirely.
        # NOTE(review): if an omitted upload is reported as None rather than
        # "", the "Enter a sequence" error can never fire - confirm the
        # field types.
        if fastaFile is not None and fastaSequence is not None:
            if fastaFile == "" and fastaSequence == "":
                raise forms.ValidationError(
                    "Enter a sequence ! Either upload or enter in directly!")
            if fastaFile and fastaSequence:
                raise forms.ValidationError(
                    "Either upload or enter in directly ! Don't do both!")

        # sequenceF wins over sequenceS if, despite the check, both are set.
        rawSequence = fastaFile or fastaSequence
        if not rawSequence:
            return

        sequence = mc.FastaFile(rawSequence, fileName=False)
        if not sequence:
            return

        if len(sequence) > self.sequenceLimit:
            raise forms.ValidationError(
                "Currently only a maximum of %s sequences are allowed" %
                self.sequenceLimit)
        if sequence.getMaxLength() > self.sequenceLengthLimit:
            raise forms.ValidationError(
                "Currently a sequence can only be %s long." %
                self.sequenceLengthLimit)
Exemple #3
0
def getMotifAndSequenceObjects(motifF, motifS, sequenceF, sequenceS):
    """Build the FastaFile objects for the motif and sequence inputs.

    For each pair the first truthy value is used (``motifF`` before
    ``motifS``, ``sequenceF`` before ``sequenceS``) and handed to
    ``mc.FastaFile(..., fileName=False)``.

    Returns:
        A ``(sequenceO, motifO)`` tuple - note the order.  An element is
        None when neither of its two inputs was supplied (previously this
        case raised UnboundLocalError).
    """
    motifSource = motifF or motifS
    sequenceSource = sequenceF or sequenceS

    motifO = mc.FastaFile(motifSource, fileName=False) if motifSource else None
    sequenceO = (mc.FastaFile(sequenceSource, fileName=False)
                 if sequenceSource else None)

    return sequenceO, motifO
Exemple #4
0
def getSequenceView(request):
    """Serve FASTA content as a ``text/plain`` HTTP response.

    GET parameters:
        sampleFastaFile: when truthy, the raw text of ``sample_fasta.fa``
            (under ``settings.BASE_DIR``) is returned.
        fastaFile: otherwise, the base name (without ``.fa``) of a file in
            ``fasta_files/``; each of its records is rendered as an HTML
            checkbox snippet.

    Returns:
        HttpResponse with the content, or a 400 response when ``fastaFile``
        is missing or is not a bare file name.
    """
    fastaFile = request.GET.get('fastaFile')
    sampleFastaB = request.GET.get('sampleFastaFile')
    base_dir = settings.BASE_DIR

    if sampleFastaB:
        with open(os.path.join(base_dir, "sample_fasta.fa")) as handle:
            readfile = handle.read()
        response = HttpResponse(content_type="text/plain")
        response.write(readfile)
        return response

    # fastaFile comes straight from the query string: accept only a bare
    # file name so the request cannot escape the fasta_files/ directory.
    if not fastaFile or os.path.basename(fastaFile) != fastaFile:
        return HttpResponse("Invalid fastaFile parameter",
                            content_type="text/plain", status=400)

    with open(os.path.join(base_dir,
                           "fasta_files/%s.fa" % fastaFile)) as handle:
        readfile = handle.read()
    fastaO = mc.FastaFile(readfile, fileName=False)

    # NOTE(review): record names and sequences are interpolated into the
    # markup unescaped; escape them if the FASTA files are not fully trusted.
    html = "".join(
        "<input type='checkbox' value='>%s\n%s'>&gt;%s<br>%s<br><br>" % (
            record.name, record.sequence, record.name, record.sequence)
        for record in fastaO)

    response = HttpResponse(content_type="text/plain")
    response.write(html)
    return response
Exemple #5
0
def get_sequences(fasta):
    """Return the upper-cased sequences of ``fasta`` and their reverse complements.

    ``fasta`` is passed unchanged to ``mycustom.FastaFile`` (presumably a
    file path - confirm against that class).

    Returns:
        A tuple ``(sequences, reverse_complements)`` of two parallel lists of
        strings; the second holds ``Seq(...).reverse_complement()`` of each
        entry in the first.
    """
    records = mycustom.FastaFile(fasta)
    forward = [record.sequence.upper() for record in records]
    reverse = [str(Seq(bases).reverse_complement()) for bases in forward]
    return forward, reverse
Exemple #6
0
    def clean(self):
        """Cross-field validation for the part-1 form.

        Checks the motifs for duplicates, the sequences for equal lengths,
        the combined problem size (``len(motifs)**4 * len(sequences)``), and
        the ordering of the spacing and GC-content bounds.  Checks whose
        inputs are absent from ``cleaned_data`` are skipped rather than
        crashing with KeyError or UnboundLocalError.

        Raises:
            forms.ValidationError: on the first failing check.
        """
        super(Part1Form, self).clean()

        cleaned = self.cleaned_data
        minSpacing = cleaned.get('minSpacing')
        maxSpacing = cleaned.get('maxSpacing')
        minimumGCContent = cleaned.get('minimumGCContent')
        maximumGCContent = cleaned.get('maximumGCContent')
        motifS = cleaned.get('motifS')
        sequenceF = cleaned.get('sequenceF')
        sequenceS = cleaned.get('sequenceS')

        if motifS == "":
            raise forms.ValidationError("Enter motifs!")

        motifO = None
        if motifS:
            motifO = mc.FastaFile(motifS, fileName=False)
            if motifO.areDuplicatesPresent():
                raise forms.ValidationError("There are duplicate motifs!")

        maximumNumberOfMotifsTimesSequences = 100

        sequenceO = None
        sequenceSource = sequenceF or sequenceS  # sequenceF takes precedence
        if sequenceSource:
            sequenceO = mc.FastaFile(sequenceSource, fileName=False)
            if not sequenceO.lengthsSame():
                raise forms.ValidationError(
                    "Sizes of the sequences should be the same")

        # The size limit can only be evaluated when both inputs were parsed.
        if motifO is not None and sequenceO is not None:
            problemSize = (len(motifO) ** 4) * len(sequenceO)
            if problemSize > maximumNumberOfMotifsTimesSequences:
                raise forms.ValidationError(
                    "Only a maximum of %s motifs^4*sequences allowed" %
                    maximumNumberOfMotifsTimesSequences)

        if (minSpacing is not None and maxSpacing is not None
                and minSpacing > maxSpacing):
            raise forms.ValidationError(
                "Maximum spacing should be greater than min spacing")

        if (minimumGCContent is not None and maximumGCContent is not None
                and minimumGCContent > maximumGCContent):
            raise forms.ValidationError(
                "Minimum GC content is larger than maximum GC content")
Exemple #7
0
    def test_One(self):
        """
        Check that FastaFile parses ``self.sampleFastas`` correctly.

        Asserts the sequence of the first two records and the record count.

        :return:
        """
        parsed = mc.FastaFile(self.sampleFastas, fileName=False)
        expectedSequences = ("AGAGATACATAGACAATGTGTTGCGTAGAGATAG", "TTTTGGAA")
        for position, expected in enumerate(expectedSequences):
            self.assertEqual(parsed[position].sequence, expected)
        self.assertEqual(len(parsed), len(expectedSequences))
import re,os,sys,glob
import mycustom
import ushuffle
from Bio import SeqIO
import pdb

# Path of the FASTA file to be simulated.
filed = "/nfs/compgen-04/team218/ilias/nullomers_hg38_v2/hg38.fa"
sequenceO = mycustom.FastaFile(filed)
sequencesL = [i.sequence.upper() for i in sequenceO]
names = [i.name.upper() for i in sequenceO]

# Number of simulations: one output FASTA file per replicate, k = 1..100.
for k in range(1, 101):
    outputPath = ("sims_genome_dinucleotide/hg38_bootstrap_number_" + str(k) +
                  "_controlling_dinucleotide_content.fasta")
    # ``with`` closes the file even if shuffling or writing fails part-way.
    with open(outputPath, "w") as datafile:
        for name, sequence in zip(names, sequencesL):
            # Third argument 2: presumably the k-let size, i.e. a shuffle
            # that keeps dinucleotide content - confirm against ushuffle.
            seq_random = ushuffle.shuffle(sequence, len(sequence), 2)
            datafile.write(">" + name + '_control_bootstrap_' + str(k) + '\n')
            datafile.write(seq_random + '\n')
Exemple #9
0
def resultsView(request):
    """Process the part-1 form and produce the MPRA design output.

    Non-POST requests are redirected to ``iliasApp:ViewIndex``.  An invalid
    form re-renders ``iliasApp/part1.html``.  Otherwise every motif
    combination is inserted into the background sequences via
    ``oligo.oligo``, barcodes are generated, and the result is returned
    either as a plain-text response (when ``usingDownload`` is posted) or
    rendered through ``iliasApp/results.html``.
    """
    if request.method != "POST":
        return HttpResponseRedirect(
            urlresolvers.reverse(("iliasApp:ViewIndex")))

    context = {}
    part1Form = Part1Form(request.POST, request.FILES)
    if not part1Form.is_valid():
        context['part1form'] = part1Form
        context['boxes'] = ['restriction', 'adapter']
        return render(request, "iliasApp/part1.html", context)

    postDict = request.POST
    cleaned = part1Form.cleaned_data

    # Everything after the last comma is dropped (presumably the value is a
    # comma-terminated list).  A missing field now yields an empty ordering
    # instead of an AttributeError on None.
    ordering = postDict.get('ordering', '').strip().split(",")[:-1]

    sequenceS = cleaned['sequenceS']
    sequenceF = cleaned['sequenceF']
    motifS = cleaned['motifS']

    reverseComplement = cleaned['reverseComplement']
    leftDistance = int(cleaned['leftDistance'])
    rightDistance = int(cleaned['rightDistance'])
    frequencyOfInsertion = int(cleaned['frequencyOfInsertion'])
    minSpacing = int(cleaned['minSpacing'])
    maxSpacing = int(cleaned['maxSpacing'])

    # Optional barcode settings: None when the field was left empty.
    barCodeDistance = (int(cleaned.get('barCodeDistance'))
                       if postDict.get('barCodeDistance') else None)
    barCodeLength = (int(cleaned.get('barCodeLength'))
                     if postDict.get('barCodeLength') else None)
    # NOTE(review): the GC bounds are taken from the raw POST data, not from
    # cleaned_data, so they are passed on unconverted - confirm intended.
    minimumGCContent = postDict.get('minimumGCContent')
    maximumGCContent = postDict.get('maximumGCContent')
    numOfBarCodesPerSequence = (int(postDict['numOfBarCodesPerSequence'])
                                if postDict.get("numOfBarCodesPerSequence")
                                else None)

    restriction1 = postDict.get('restriction1')
    restriction2 = postDict.get('restriction2')

    adapter1 = postDict.get('adapter1')
    adapter2 = postDict.get('adapter2')

    motifO = mc.FastaFile(motifS, fileName=False)

    # NOTE(review): sequenceO stays unbound when neither input is supplied;
    # form validation is presumably expected to rule that out - confirm.
    if sequenceF:
        sequenceO = mc.FastaFile(sequenceF, fileName=False)
    elif sequenceS:
        sequenceO = mc.FastaFile(sequenceS, fileName=False)

    motifsL = [motif.sequence for motif in motifO]
    allCombinations = part1.generateCombinations(motifsL)

    # This is only working for 20 sequence for now. CHANGE THIS
    numOfSequencesToUse = 20
    selectedRecords = sequenceO[:numOfSequencesToUse]
    # Each background sequence is truncated to its first 800 characters.
    backgroundSequencesL = [record.sequence[:800] for record in selectedRecords]
    backgroundSequenceHeadersL = [record.name for record in selectedRecords]

    # doing the reverse complement
    if reverseComplement:
        backgroundSequencesL = [
            myf.revcompl(backgroundSequence).lower()
            for backgroundSequence in backgroundSequencesL
        ]

    # getting the combinations
    finalOutput = []
    for header, backgroundSequence in zip(backgroundSequenceHeadersL,
                                          backgroundSequencesL):
        for combination in allCombinations:
            finalOutput += oligo.oligo(backgroundSequence, minSpacing,
                                       maxSpacing, combination,
                                       leftDistance, rightDistance,
                                       frequencyOfInsertion, header)

    # creating the barcodes. It can be a none value
    barCodes, numOfBarCodesPerSequence = part1.getBarCodes(
        barCodeLength, minimumGCContent, maximumGCContent,
        numOfBarCodesPerSequence, barCodeDistance, finalOutput)

    mpraOutput, sequenceHTMLL = part1.createMPRAResultOutput(
        finalOutput, numOfBarCodesPerSequence, barCodes, restriction1,
        restriction2, adapter1, adapter2, ordering)

    if postDict.get('usingDownload', False):
        response = HttpResponse(content_type="text/plain")
        response.write(mpraOutput)
        return response

    context['sequenceHTML'] = sequenceHTMLL
    context['forDownload'] = mpraOutput
    context['fileName'] = 'MPRA_Motif_results.txt'
    return render(request, "iliasApp/results.html", context)
Exemple #10
0
def part3RresultsView(request):
    """Apply the requested transformations to the submitted sequences.

    Non-POST requests are redirected to ``iliasApp:ViewPart3``; an invalid
    form re-renders ``iliasApp/part3.html``.  Otherwise each sequence is
    optionally scrambled, reversed and complemented (POST flags
    ``scramble`` / ``reverse`` / ``complement`` equal to ``"on"``), then
    ``numToMutate`` positions are mutated, and the results are rendered
    through ``iliasApp/part3Results.html``.
    """
    if request.method != "POST":
        return HttpResponseRedirect(
            urlresolvers.reverse(("iliasApp:ViewPart3")))

    context = {}
    form = Part3Form(request.POST, request.FILES)
    if not form.is_valid():
        context['form'] = form
        return render(request, "iliasApp/part3.html", context)

    sequenceS = form.cleaned_data.get('sequenceS')
    sequenceF = form.cleaned_data.get('sequenceF')

    scrambleOption = request.POST.get('scramble')
    reverseOption = request.POST.get('reverse')
    compOption = request.POST.get('complement')

    # An empty/missing value means "no mutations" instead of int(None) failing.
    numToMutate = int(form.cleaned_data.get('numToMutate') or 0)

    # NOTE(review): sequenceO stays unbound when neither input is supplied;
    # form validation is presumably expected to rule that out - confirm.
    if sequenceF:
        sequenceO = mc.FastaFile(sequenceF, fileName=False)
    elif sequenceS:
        sequenceO = mc.FastaFile(sequenceS, fileName=False)

    outputSequenceL = [record.sequence for record in sequenceO]

    scrambleHeader = "No"
    reverseHeader = "No"
    complementHeader = "No"

    if scrambleOption == "on":
        scrambleHeader = "Yes"
        outputSequenceL = [
            part3.scramble_motifs(seq) for seq in outputSequenceL
        ]

    if reverseOption == "on":
        reverseHeader = "Yes"
        outputSequenceL = [seq[::-1] for seq in outputSequenceL]

    if compOption == "on":
        complementHeader = "Yes"
        outputSequenceL = [myf.complement(seq) for seq in outputSequenceL]

    # Without mutations the plain and the HTML output are the same list.
    finalOutputSequenceL = outputSequenceL
    outputSequenceHTMLL = outputSequenceL

    if numToMutate:
        finalOutputSequenceL = []
        outputSequenceHTMLL = []

        for seq in outputSequenceL:
            mutatedString, positionMutated = part3.mutateString(
                seq, numToMutate)
            finalOutputSequenceL.append(mutatedString)
            outputSequenceHTMLL.append(
                myf.highlightString(mutatedString, positionMutated))

    # The option summary is identical for every record, so format it once.
    headerSuffix = (
        "| Mutated_nucleotides - %s | Scrambled - %s | Reversed - %s | Complemented - %s"
        % (numToMutate, scrambleHeader, reverseHeader, complementHeader))
    headers = [">" + record.name + headerSuffix for record in sequenceO]

    context['headers'] = headers
    # A list (not a one-shot iterator) so the template can iterate it freely.
    context['zipped'] = list(zip(headers, outputSequenceHTMLL))

    # YOU NEED TO add new lines as the sequence will be displayed
    context['forDownload'] = "".join(
        header + '\n' + sequence + '\n'
        for header, sequence in zip(headers, finalOutputSequenceL))
    context['fileName'] = "Transmutation_results.txt"

    return render(request, "iliasApp/part3Results.html", context)
Exemple #11
0
# 32 jobs ( 1 motif per job)
# The 1-based job number (first command-line argument) selects which slice
# of the motif file this job processes.
jobNumber = int(sys.argv[1])
motifsPerJob = 1
lowerBound = (jobNumber - 1) * motifsPerJob
upperBound = jobNumber * motifsPerJob

# The genomes to scan for the motif occurrences, finds motifs in the plus
# orientation of the genome
files = glob.glob(
    "/lustre/scratch117/cellgen/team218/igs/properties/hg19/All_chr_hg19.fa")
# read the genomic files, in this case only the human genome hg19
for filed in files:
    # Single-argument print(...) calls produce the same output on Python 2
    # and Python 3 (the bare print statement is a SyntaxError on Python 3).
    print("This is the input argument: %s " % jobNumber)

    # reading the sequence file
    sequenceO = mycustom.FastaFile(filed)
    sequencesL = [i.sequence for i in sequenceO]
    del sequenceO  # drop the parsed object once its sequences are extracted

    # reading the motif file, provide the path to the motifs
    motifO = mycustom.FastaFile("polyN1.fa")
    motifsL = [i.sequence for i in motifO[lowerBound:upperBound]]
    del motifO

    # Two spaces on purpose: matches the output of the original
    # ``print "... ", value`` statements.
    print("lowerbound is  %s" % lowerBound)
    print("upperbound is  %s" % upperBound)

    # now finding the sequences
    result = Motif_combinatorics.findAllMotifAllSeqs(motifsL, sequencesL)

    # provide path to output here
Exemple #12
0
    'W': 'W',
    'S': 'S',
    'R': 'Y',
    'Y': 'R'
}[B] for B in x][::-1])

# The condition is always true, so the body below always runs.
if 1 == 1:
    # provide path to the two json files generated for each strand
    json_file1 = "All_chr_hg19_di_same_strand_n1.json"
    json_file2 = "All_chr_hg19_di_opposite_strand_n1.json"

    # provide path to genome
    # NOTE(review): both paths are identical, so the same file is parsed
    # twice below - confirm whether two different inputs were intended.
    seqFileName1 = "/lustre/scratch117/cellgen/team218/igs/properties/hg19/All_chr_hg19.fa"
    seqFileName2 = "/lustre/scratch117/cellgen/team218/igs/properties/hg19/All_chr_hg19.fa"

    # First input: sequences, names, and a size estimate.
    fastaO = mc.FastaFile(seqFileName1)
    SequencesL1 = [i.sequence for i in fastaO.getSequences()]
    NamesL1 = [i for i in fastaO.getNames()]
    sequence_length1 = len(SequencesL1[0])
    # Length of the first sequence times the number of sequences; this is
    # the true nucleotide total only if all sequences have the same length.
    nucs_seq1 = sequence_length1 * len(SequencesL1)

    # Second input: same steps; fastaO is rebound to the second file.
    fastaO = mc.FastaFile(seqFileName2)
    SequencesL2 = [i.sequence for i in fastaO.getSequences()]
    NamesL2 = [i for i in fastaO.getNames()]
    # NOTE(review): uses index [1] where the first input uses [0]; this
    # raises IndexError for a single-sequence file - possibly a typo.
    sequence_length2 = len(SequencesL2[1])
    nucs_seq2 = sequence_length2 * len(SequencesL2)

    # provide file of polyN motifs (e.g. di-nucleotides or mono-nucleotides)
    motifO = mc.FastaFile("polyNs_di.fa")
    consensusL = [i.sequence for i in motifO.getSequences()]
    # MotifsL is a second name for the same list object, not a copy.
    MotifsL = consensusL