Beispiel #1
0
def run_pam_finder(target_fa, seq, PAM, abs_start_pos, chr):
    """Locate ``seq + PAM`` on both strands of ``target_fa``.

    The pattern is searched with ``SeqUtils.nt_search`` in ``target_fa`` and
    in ``revcomp(target_fa)``.  Every hit becomes one row of the form
    ``[chr, start, end, matched seq part, matched seq + PAM part, strand]``,
    where ``start``/``end`` are shifted by ``abs_start_pos`` and, for
    reverse-strand hits, converted back to forward-strand offsets.

    Args:
        target_fa: Sequence string to search.
        seq: Pattern placed in front of the PAM.
        PAM: PAM pattern appended to ``seq``.
        abs_start_pos: Offset added to every reported coordinate.
        chr: Value stored in the first column of every row.

    Returns:
        pandas.DataFrame: One row per hit, forward hits first.
    """
    pattern = seq + PAM
    seq_len = len(seq)
    site_len = seq_len + len(PAM)
    target_len = len(target_fa)

    rev_seq = revcomp(target_fa)
    # nt_search returns [pattern, pos0, pos1, ...]; only positions matter.
    fwd_hits = SeqUtils.nt_search(target_fa, pattern)[1:]
    rev_hits = SeqUtils.nt_search(rev_seq, pattern)[1:]

    rows = [
        [
            chr,
            pos + abs_start_pos,
            pos + abs_start_pos + seq_len,
            target_fa[pos:pos + seq_len],
            target_fa[pos:pos + site_len],
            "+",
        ]
        for pos in fwd_hits
    ]

    for pos in rev_hits:
        # A hit at ``pos`` on the reverse complement ends at
        # ``target_len - pos`` when expressed on the forward strand.
        fwd_end = (target_len - pos) + abs_start_pos
        rows.append([
            chr,
            fwd_end - seq_len,
            fwd_end,
            rev_seq[pos:pos + seq_len],
            rev_seq[pos:pos + site_len],
            "-",
        ])

    return pd.DataFrame(rows)
Beispiel #2
0
def count_amplicons(in_name, fprimer, rc):
    """Tally primer positions across the reads of a FASTQ file and print them.

    For every read, the first position at which ``fprimer`` matches is
    counted.  When ``rc`` is true, the reverse complement of ``fprimer`` is
    searched as well; the number of bases following its last match is
    counted, along with the number of reads in which both were found.

    Args:
        in_name: Path of the FASTQ file to read.
        fprimer: Primer sequence (IUPAC ambiguity codes allowed).
        rc: Whether to also search for the primer's reverse complement.
    """
    fwd_pattern = str(Seq(fprimer, IUPAC.ambiguous_dna))
    pre_length = Counter()
    if rc:
        post_length = Counter()
        both_found = 0
        rc_primer = Seq(fprimer, IUPAC.ambiguous_dna).reverse_complement()
        rc_pattern = str(rc_primer)
        rc_len = len(rc_primer)

    with open(in_name, 'r') as fastq_handle:
        for read in SeqIO.parse(fastq_handle, "fastq"):
            read_seq = str(read.seq)

            # nt_search returns the pattern followed by the match positions.
            fwd_hits = SeqUtils.nt_search(read_seq, fwd_pattern)
            fwd_found = len(fwd_hits) > 1
            if fwd_found:
                pre_length[fwd_hits[1]] += 1

            if not rc:
                continue

            rc_hits = SeqUtils.nt_search(read_seq, rc_pattern)
            if len(rc_hits) <= 1:
                continue

            # Bases remaining after the last reverse-complement match.
            post_length[len(read) - rc_hits[-1] - rc_len] += 1
            if fwd_found:
                both_found += 1

    print("Primers found:", sum(pre_length.values()))
    print("Counts of pre_length:", pre_length)
    if rc:
        print("Reverse primers found:", sum(post_length.values()))
        print("Counts of post_length", post_length)
        print("Both primer and reverse_complement found:", both_found)
Beispiel #3
0
    def filtByPrimer(self, fwd_primer, rvs_primer):
        """Keep read pairs in which both reads begin with their primer.

        Reads ``self.input_forward`` and ``self.input_reverse`` in lockstep.
        A pair is appended to ``self.output_forward`` /
        ``self.output_reverse`` when the first ``nt_search`` hit of
        ``fwd_primer`` in the forward read and of ``rvs_primer`` in the
        reverse read are both at position 0; every other pair is discarded.
        The kept and discarded counts are printed at the end.

        The output files are opened once, in append mode, for the whole run
        instead of once per kept pair.  As a consequence they are created
        even when no pair is kept.

        Args:
            fwd_primer: Primer pattern expected at the start of forward reads.
            rvs_primer: Primer pattern expected at the start of reverse reads.
        """
        count_keep = 0
        count_discard = 0

        with open(self.input_forward) as fh, \
                open(self.input_reverse) as rh, \
                open(self.output_forward, 'a') as ofh, \
                open(self.output_reverse, 'a') as orh:

            read_pairs = zip(FastqGeneralIterator(fh),
                             FastqGeneralIterator(rh))

            for (title_f, seq_f, qual_f), (title_r, seq_r, qual_r) in read_pairs:
                # nt_search returns [pattern, pos0, pos1, ...]; the slice
                # [1:2] is [0] only when a first hit exists and sits at 0,
                # so no IndexError handling is needed for "no hit".
                both_at_start = (
                    SeqUtils.nt_search(seq_f, fwd_primer)[1:2] == [0]
                    and SeqUtils.nt_search(seq_r, rvs_primer)[1:2] == [0]
                )

                if not both_at_start:
                    count_discard += 1
                    continue

                ofh.write(
                    '@' + '\n'.join([title_f, seq_f, '+', qual_f]) + '\n')
                orh.write(
                    '@' + '\n'.join([title_r, seq_r, '+', qual_r]) + '\n')
                count_keep += 1

        print('     Number of reads saved: ' + str(count_keep))
        print('     Number of reads discard :' + str(count_discard))
Beispiel #4
0
def search_motif(sequence):
    """Return protospacer coordinates next to every PAM hit on both strands.

    The PAM (``args.pam``) is searched with ``SeqUtils.nt_search`` in
    ``sequence.seq`` and in its reverse complement.  For each hit the
    protospacer is the ``args.length_protospacer`` bases immediately before
    the PAM on the searched strand.  Coordinates are 0-based offsets into the
    forward strand; reverse-strand hits are converted accordingly.

    A protospacer that ends exactly at a sequence edge (``start == 0`` on the
    forward strand, ``end == len(sequence.seq)`` on the reverse strand) fits
    inside the sequence and is reported; earlier versions dropped these.

    Args:
        sequence: Object exposing the nucleotide sequence as ``.seq``
            (with a ``reverse_complement()`` method).

    Returns:
        tuple: ``(coordinates_fw, coordinates_rv)``, each a list of
        ``[start, end]`` pairs.
    """
    motif = str(args.pam)
    len_protospacer = int(args.length_protospacer)
    len_dna = len(sequence.seq)

    # nt_search returns a list holding the expanded motif followed by the
    # 0-based start position of every hit.

    # Forward strand: the PAM start is the protospacer end.
    matches_fw = SeqUtils.nt_search(str(sequence.seq), motif)
    coordinates_fw = []
    for pam_start in matches_fw[1:]:
        start = pam_start - len_protospacer
        # Keep the hit only if the whole protospacer lies inside the sequence.
        if start >= 0:
            coordinates_fw.append([start, pam_start])

    # Reverse strand: search the reverse complement, then mirror the
    # coordinates back onto the forward strand.
    reverse_seq = str(sequence.seq.reverse_complement())
    matches_rv = SeqUtils.nt_search(reverse_seq, motif)
    coordinates_rv = []
    for pam_start in matches_rv[1:]:
        start = len_dna - pam_start
        end = len_dna - (pam_start - len_protospacer)
        if start >= 0 and end <= len_dna:
            coordinates_rv.append([start, end])

    # Return a tuple of lists for fw and rv matches
    return coordinates_fw, coordinates_rv
Beispiel #5
0
def writePBS():
    """Annotate ``newRecord`` with primer features found in ``record``.

    For every entry of ``featureStatistic_container[feature]`` the tail of the
    primer sequence (slice starting at ``len(primerSeq) - 15``) is searched
    with ``SeqUtils.nt_search`` in ``record.seq`` and in its reverse
    complement.  Each hit appends a ``SeqFeature`` to ``newRecord.features``:
    with an exact end position when the full primer is found around the hit,
    otherwise with an ``AfterPosition`` end.

    Relies on names defined outside this function (``feature``, ``record``,
    ``newRecord``, ``featureStatistic_container``) and rebinds the globals
    listed in the ``global`` statement below.
    """
    global variation, seqRecordToCheck, seqRecordToCheckComplement, difference, newFeature
    for variation in featureStatistic_container[feature]:
        primerSeq = str(variation.seq)
        primerName = variation.note

        # Tail of the primer used as the search pattern.
        # NOTE(review): for primers shorter than 15 characters the start index
        # is negative, so the slice is shorter than the primer - confirm that
        # such primers cannot occur.
        partialPrimerSeq = primerSeq[len(primerSeq) - 15::]
        seqRecordToCheck = str(record.seq)
        seqRecordToCheckComplement = str(reverse_complement(record.seq))

        # nt_search returns [pattern, pos0, pos1, ...]; positions start at index 1.
        matchingPrimerPositions = SeqUtils.nt_search(seqRecordToCheck, partialPrimerSeq)

        if (len(matchingPrimerPositions) > 1):
            # Number of primer characters in front of the searched tail.
            difference = len(primerSeq) - len(partialPrimerSeq)
            length = len(matchingPrimerPositions)
            for j in range(1, length):
                # Full-primer check: compare against the slice that starts
                # ``difference`` characters before the tail hit.
                # NOTE(review): the feature below starts at the tail hit
                # itself, not at ``hit - difference`` - confirm this is the
                # intended location.
                if primerSeq == seqRecordToCheck[matchingPrimerPositions[j] -
                        difference: matchingPrimerPositions[j] - difference + len(primerSeq)]:
                    newFeature = SeqFeature(FeatureLocation(matchingPrimerPositions[j],
                                                            matchingPrimerPositions[j] + len(primerSeq),
                                                            strand=1), type=str(feature))
                    newFeature.qualifiers['note'] = primerName
                    newRecord.features.append(newFeature)

                else:
                    # Only the tail matched: mark the end as fuzzy.
                    newFeature = SeqFeature(FeatureLocation(matchingPrimerPositions[j], AfterPosition(
                        matchingPrimerPositions[j] + len(primerSeq)), strand=1), type=str(feature))
                    newFeature.qualifiers['note'] = primerName
                    newRecord.features.append(newFeature)

        # Same procedure on the reverse-complemented sequence, strand=-1.
        # NOTE(review): hit positions are offsets into the reverse-complement
        # string and are used as feature coordinates without conversion -
        # verify against how newRecord coordinates are interpreted.
        matchingPrimerPositions = SeqUtils.nt_search(seqRecordToCheckComplement, partialPrimerSeq)

        if (len(matchingPrimerPositions) > 1):
            difference = len(primerSeq) - len(partialPrimerSeq)
            length = len(matchingPrimerPositions)
            for j in range(1, length):
                if primerSeq == seqRecordToCheckComplement[matchingPrimerPositions[j] -
                        difference: matchingPrimerPositions[j] - difference + len(primerSeq)]:
                    newFeature = SeqFeature(FeatureLocation(matchingPrimerPositions[j],
                                                            matchingPrimerPositions[j] + len(primerSeq),
                                                            strand=-1), type=str(feature))
                    newFeature.qualifiers['note'] = primerName
                    newRecord.features.append(newFeature)
                else:
                    newFeature = SeqFeature(FeatureLocation(matchingPrimerPositions[j], AfterPosition(
                        matchingPrimerPositions[j] + len(primerSeq)), strand=-1), type=str(feature))
                    newFeature.qualifiers['note'] = primerName
                    newRecord.features.append(newFeature)
Beispiel #6
0
    def _find_iseq(self,
                   seq: Seq,
                   iseq_str: str,
                   iseq_id: str = "integrated sequence") -> int:
        """Locate the single occurrence of ``iseq_str`` inside ``seq``.

        Args:
            seq: Sequence that is searched.
            iseq_str: Subsequence pattern to look for.
            iseq_id (optional): Human-readable name of the subsequence, used
                in error messages. Defaults to "integrated sequence".

        Returns:
            int: Index of the only match of ``iseq_str`` in ``seq``.

        Raises:
            PartException: If ``iseq_str`` is absent from ``seq`` or occurs
                more than once.
        """
        # nt_search yields [pattern, pos0, pos1, ...]; split off the pattern.
        _pattern, *positions = SeqUtils.nt_search(str(seq), iseq_str)
        if not positions:
            raise PartException(f"{self.id} lacks {iseq_id}")
        if len(positions) > 1:
            raise PartException(f"{self.id} contains multiple {iseq_id}")
        return positions[0]
def main():
    """Print, for every gene, the motifs that occur in its 3'UTR."""
    # Genome sequence and annotations
    genome = load_file('http://tritrypdb.org/common/downloads/release-6.0/TcruziCLBrenerEsmeraldo-like/fasta/data/TriTrypDB-6.0_TcruziCLBrenerEsmeraldo-like_Genome.fasta')
    annotations = load_file('http://tritrypdb.org/common/downloads/release-6.0/TcruziCLBrenerEsmeraldo-like/gff/data/TriTrypDB-6.0_TcruziCLBrenerEsmeraldo-like.gff')

    # 3'UTR motifs from supplementary table 2 in Najafabadi et al. (2013)
    motifs = load_motifs('najafabadi_table_s1_2013.csv')

    # Genome sequence, then 3'UTR coordinates derived from the annotations
    chromosomes = load_fasta(genome)
    genes = get_utr_coords(annotations, utr_length=500)

    for gene in genes:
        utr_seq = get_3utr_seq(chromosomes, gene)

        # nt_search returns [pattern, pos0, ...]: more than one element
        # means the motif matched at least once.
        utr3_motifs = [
            motif for motif in motifs
            if len(SeqUtils.nt_search(utr_seq, motif)) > 1
        ]

        print("%s: %s" % (gene['id'], ", ".join(utr3_motifs)))
def parseSeqRecordForOligo(record,oligo):
    '''Return True when oligo matches somewhere in record.seq, else False.'''
    # nt_search returns the pattern followed by one entry per match position,
    # so any element beyond the first means a hit.
    hits = SeqUtils.nt_search(str(record.seq), oligo)
    return len(hits) > 1
Beispiel #9
0
def find_PAM(seq, PAM):
    """Return the index of ``PAM`` within ``seq``.

    A literal occurrence is tried first.  If there is none, ``PAM`` is
    treated as an ambiguity-code pattern and checked with
    ``SeqUtils.nt_search`` against the first ``len(PAM)`` characters of
    ``seq`` and then against the last ``len(PAM)`` characters.  If that also
    fails, a message is printed and 20 is returned.

    Args:
        seq: Sequence to search.
        PAM: PAM sequence or ambiguity-code pattern.

    Returns:
        int: Index of the PAM in ``seq``, or 20 when it cannot be found.
    """
    try:
        return seq.index(PAM)
    except ValueError:
        # No literal occurrence; fall through to the pattern searches.
        # Only ValueError is caught so unrelated errors are not masked.
        pass

    # PAM on the left
    left_search = SeqUtils.nt_search(seq[:len(PAM)], PAM)
    if len(left_search) > 1:
        return left_search[1]

    # PAM on the right
    right_search = SeqUtils.nt_search(seq[-len(PAM):], PAM)
    if len(right_search) > 1:
        return len(seq) - len(PAM)

    print("PAM: %s not found in %s. Set PAM index to 20" %
          (PAM, seq))
    return 20
Beispiel #10
0
def is_dPAM(PAM_seq, RTT, cut_offset=-3):
    """Return 0 if ``PAM_seq`` matches ``revcomp(RTT)`` at ``abs(cut_offset)``, else 1.

    RTT is always reverse-complemented before the search because it is
    oriented as xxxxxxxPAM.  RTT is assumed to contain no N.

    Args:
        PAM_seq: PAM pattern to search for.
        RTT: Sequence that is reverse-complemented and searched.
        cut_offset: Offset whose absolute value is the expected hit position.

    Returns:
        int: 0 when a hit is reported at ``abs(cut_offset)``, otherwise 1.
    """
    hits = SeqUtils.nt_search(revcomp(RTT), PAM_seq)
    # hits[0] is the search pattern; actual positions follow it.
    if len(hits) > 1 and abs(cut_offset) in hits:
        return 0
    return 1
def main():
    """Write a CSV listing, per gene, the motifs present in its 3'UTR."""
    # Parse command-line arguments
    args = parse_args()

    # Genome sequence and annotations
    genome = load_file(args.input_genome)
    annotations = load_file(args.input_annotations)

    # 3'UTR motifs from supplementary table 2 in Najafabadi et al. (2013)
    motifs = load_motifs('najafabadi_table_s1_2013.csv')

    # Genome sequence, then 3'UTR coordinates derived from the annotations
    chromosomes = load_fasta(genome)
    genes = get_utr_coords(annotations, utr_length=args.utr_length)

    # One output row per gene: the gene id followed by its matching motifs
    output = []
    num_genes = len(genes)

    for gene_number, gene in enumerate(genes, start=1):
        utr_seq = get_3utr_seq(chromosomes, gene)

        print('Processing gene %d/%d' % (gene_number, num_genes))

        # Everything after the first nt_search element is a match position;
        # a non-empty remainder means the motif is present.
        present_motifs = [
            motif for motif in motifs
            if SeqUtils.nt_search(utr_seq, motif)[1:]
        ]

        output.append([gene['id']] + present_motifs)

    # output results
    with open(args.output, 'w') as output_file:
        csv.writer(output_file).writerows(output)
Beispiel #12
0
def annotate_primer(primer_name, primer_seq, primer_direction, genome):
    """Append a ``misc_feature`` marking the primer's first match in ``genome``.

    Args:
        primer_name: Name appended to ``PRIMER_ANNOTATION_PREFIX`` to form the
            feature label.
        primer_seq: Primer sequence, or a ``SeqRecord`` (including
            subclasses) whose ``.seq`` is used.
        primer_direction: Strand of the feature; ``-1`` makes the search use
            the primer's reverse complement.
        genome: Record whose ``.seq`` is searched and whose ``.features``
            list receives the new feature.

    Raises:
        IndexError: If the primer does not match anywhere in ``genome.seq``.
    """
    if isinstance(primer_seq, SeqRecord):
        primer_seq = primer_seq.seq
    if primer_direction == -1:
        primer_seq = primer_seq.reverse_complement()

    primer_label = PRIMER_ANNOTATION_PREFIX + primer_name

    # nt_search returns [pattern, pos0, pos1, ...]; without a position there
    # is nothing to annotate.  IndexError is kept as the exception type, now
    # with an explanatory message.
    search_result = SeqUtils.nt_search(str(genome.seq), str(primer_seq))
    if len(search_result) < 2:
        raise IndexError(
            "primer %r not found in genome sequence" % (primer_name,))
    primer_genome_loc_start = search_result[1]

    primer_genome_loc = FeatureLocation(
        primer_genome_loc_start,
        primer_genome_loc_start+len(primer_seq))
    primer_feature = SeqFeature(
        location=primer_genome_loc, type='misc_feature',
        strand=primer_direction,
        qualifiers={'label': [primer_label]})

    genome.features.append(primer_feature)
def digest(enzyme, sequence, outfile, count):
    """Write BED-style cut-site lines for every match of an enzyme site.

    The upper-cased ``sequence.seq`` is searched for ``enzyme[1]``.  Each
    match produces a "+" line at ``match + cut`` and a "-" line at
    ``match + (site length - cut)``, both labelled ``cut-<count>``.  The line
    with the smaller position is written first so the output stays sorted.

    Backtick repr (Python 2 only) has been replaced by ``str()`` so the
    function also runs on Python 3.

    Args:
        enzyme: Sequence of ``(name, recognition pattern, cut offset)``.
        sequence: Record exposing ``.id`` and ``.seq``.
        outfile: Writable file-like object.
        count: Number used for the first cut label.

    Returns:
        int: ``count`` increased by the number of matches written.
    """
    name = enzyme[0]
    site_len = len(enzyme[1])
    cut_offset = int(enzyme[2])
    # If the cut site is past the halfway point of the recognition site, the
    # antisense cut has the smaller coordinate and must be written first.
    antisense_first = site_len / 2 < cut_offset

    # First element of the nt_search result is the pattern; the rest are
    # match positions.
    matches = SeqUtils.nt_search(str(sequence.seq).upper(), enzyme[1])

    for match in matches[1:]:
        sense_pos = str(int(match) + cut_offset)
        antisense_pos = str(int(match) + (site_len - cut_offset))
        label = "\tcut-" + str(count)

        # Cut on the query strand
        line1 = (sequence.id + "\t" + sense_pos + "\t" + sense_pos + "\t" +
                 name + label + "\t+\n")
        # Cut on the reverse-complement strand
        line2 = (sequence.id + "\t" + antisense_pos + "\t" + antisense_pos +
                 "\t" + name + label + "\t-\n")

        if antisense_first:
            outfile.write(line2 + line1)
        else:
            outfile.write(line1 + line2)

        count += 1
    return count
def digest(enzyme, sequence, outfile, count):
    """Emit one "+" and one "-" BED-style cut line per enzyme-site match.

    ``enzyme`` is indexed as ``(name, recognition pattern, cut offset)``.
    The upper-cased ``sequence.seq`` is searched for the pattern; for each
    match both cut positions are written to ``outfile`` and ``count`` is
    incremented.  The updated ``count`` is returned.
    """
    # nt_search result: pattern first, then the match positions.
    hits = SeqUtils.nt_search(str(sequence.seq).upper(), enzyme[1])

    for match in hits[1:]:
        sense_cut = str(int(match) + int(enzyme[2]))
        sense_line = "\t".join([
            sequence.id, sense_cut, sense_cut, enzyme[0],
            "cut-" + str(count), "+\n",
        ])

        antisense_cut = str(int(match) + int(len(enzyme[1]) - int(enzyme[2])))
        antisense_line = "\t".join([
            sequence.id, antisense_cut, antisense_cut, enzyme[0],
            "cut-" + str(count), "-\n",
        ])

        # Past the halfway point of the site the antisense cut comes first,
        # which keeps the BED output sorted by position.
        if len(enzyme[1]) / 2 < int(enzyme[2]):
            ordered = antisense_line + sense_line
        else:
            ordered = sense_line + antisense_line
        outfile.write(ordered)

        count += 1
    return count
def split_relabel(infile, gene, m, sample_IDs, output_handle):
    """Trim primers from reads that start with the gene's forward primer.

    Reads the FASTQ file ``infile``.  Every record whose first ``nt_search``
    hit for the forward primer is at position 0 is trimmed (forward primer
    length from the start; for ``m == "merged"`` also the reverse primer
    length from the end), relabelled with the gene and sample ID, and
    written to ``output_handle``.  A summary line is appended to
    ``Primer_split_log.txt``.

    Both the log and the reads file are closed even if processing fails, and
    the summary line now ends with a newline so successive runs do not merge
    onto one line.

    Args:
        infile: FASTQ file name; the label is cut out of it with the
            ``Wx80_`` prefix and a mode-specific suffix.
        gene: Key into the module-level ``primerlist`` mapping.
        m: ``"merged"`` or ``"unmerged"``.
        sample_IDs: Mapping from well identifier to sample ID.
        output_handle: Open handle that receives the trimmed FASTQ records;
            it is left open.

    Raises:
        ValueError: If ``m`` is neither ``"merged"`` nor ``"unmerged"``.
    """
    if m == "merged":
        label = re.split('Wx80_|_pear', infile)[1]
    elif m == "unmerged":
        label = re.split('Wx80_|_L001', infile)[1]
    else:
        raise ValueError(
            "m must be 'merged' or 'unmerged', got {0!r}".format(m))

    well = re.split('_S', label)[0]  # NZGL well number
    sample_ID = sample_IDs.get(well)  # Corresponding sample ID
    primers = primerlist.get(gene)
    primer_f = primers[0]
    primer_r = primers[1]
    count = 0

    # Use gzip.open(infile) instead of open(infile) if files are gzipped.
    with open("Primer_split_log.txt", "a") as log, \
            open(infile) as reads_handle:
        log.write("Reads file\tPrimer\tRead count\n")

        for record in SeqIO.parse(reads_handle, "fastq"):
            primer_search = SeqUtils.nt_search(str(record.seq), primer_f)
            # Primer must be found (len > 1) at the start of the read
            # (first hit == 0).
            if not (len(primer_search) > 1 and primer_search[1] == 0):
                continue

            if m == "merged":
                trimmed = record[len(primer_f):-len(primer_r)]
            else:
                trimmed = record[len(primer_f):]

            # Adds gene and sample ID
            trimmed.id = "{0}|gene_{1}|sample_{2}".format(
                trimmed.id, gene, sample_ID)
            print(trimmed.id)
            print(len(trimmed.seq))
            SeqIO.write(trimmed, output_handle, "fastq")
            count += 1
            print("{0} {1} {2} {3}".format(gene, label, sample_ID, count))

        print("{0} {1}: Saved {2} reads".format(label, gene, count))
        log.write("{0} {1} {2}: {3}\n".format(gene, label, sample_ID, count))
Beispiel #16
0
                    help='Motif fasta file')
parser.add_argument('--search',
                    '-s',
                    required=True,
                    help='Sequence fasta file to search.')
parser.add_argument('--outfile',
                    '-o',
                    type=argparse.FileType('w', encoding='UTF-8'),
                    required=True,
                    help='Outfile (will be in bed format).')
args = parser.parse_args()

d = []
for seq_motif in SeqIO.parse(args.motif, "fasta"):
    for seq in SeqIO.parse(args.search, "fasta"):
        results = SeqUtils.nt_search(str(seq.seq), seq_motif.seq)
        results_rc = SeqUtils.nt_search(str(seq.seq),
                                        seq_motif.seq.reverse_complement())
        for i in results[1:]:
            d.append({
                'search': seq.id,
                'motif': results[0],
                'first': i + 1,
                'last': (i) + len(results[0])
            })
        for i in results_rc[1:]:
            d.append({
                'search': seq.id,
                'motif': results_rc[0],
                'first': i + 1,
                'last': (i) + len(results_rc[0])
 def map_locator_Spark(x, subsequence):
     """Return True when ``subsequence`` matches somewhere in ``x[1]``."""
     # nt_search returns the pattern followed by the match positions.
     hits = SeqUtils.nt_search(x[1], subsequence)
     return len(hits) > 1
from Bio import SeqIO	
from Bio.Alphabet import IUPAC		
from Bio.Seq import Seq
from Bio import motifs			
from Bio import SeqUtils


# Search every lncRNA transcript for the degenerate consensus of the p53
# binding-site motif and write the raw nt_search result per transcript.
with open("sites/MA0106.1.sites") as handle:
    p53 = motifs.read(handle, "sites")

motif = p53.degenerate_consensus

with open("output/motif_result_p53.txt", "w") as f:
    for seq_record in SeqIO.parse('input/gencode.v26.lncRNA_transcripts.fa', 'fasta'):
        f.write(">" + str(seq_record.id) + "\n")
        # Search the nucleotide sequence itself: str(seq_record) is the
        # record's text summary, not its sequence.  The pattern is the
        # consensus bound to ``motif`` above.
        result = SeqUtils.nt_search(str(seq_record.seq), str(motif))
        f.write(str(result) + "\n")

##

# Same search for the AGL3 motif.
with open("sites/MA0001.1.sites") as handle:
    AGL3 = motifs.read(handle, "sites")

motif = AGL3.degenerate_consensus

with open("output/motif_result_AGL3.txt", "w") as f:
    for seq_record in SeqIO.parse('input/gencode.v26.lncRNA_transcripts.fa', 'fasta'):
        f.write(">" + str(seq_record.id) + "\n")
        result = SeqUtils.nt_search(str(seq_record.seq), str(motif))
        f.write(str(result) + "\n")
Beispiel #19
0
 def find_enzyme(input_seq):
     """Run ``nt_search`` on ``input_seq`` for each ``Restriction.data["IUPAC sequence"]`` entry.

     The search results are not stored or returned; the function always
     returns None.
     """
     query = Seq(input_seq, alphabet=Bio.Alphabet.IUPAC.unambiguous_dna)
     recognition_seqs = Restriction.data["IUPAC sequence"]
     for rec_seq in recognition_seqs:
         SeqUtils.nt_search(query, rec_seq)
     return None
Beispiel #20
0
from Bio import SeqUtils

# Minimal nt_search demonstration: look for an ambiguity-code consensus in a
# plain DNA string.  The call returns the expanded pattern followed by the
# match positions; the result is not stored here.
sequence = "CGTAGCTAGCTCAGAGCAGGGACACGTGCTAGCAACAGCGCT"
consensus = "RGWYV"

SeqUtils.nt_search(sequence, consensus)
Beispiel #21
0
def FindSpacersInterval(chromPos, chromStartRG, chromEndRG, seqStr,
                        cutoff_spacing, referenceGenomeForDAS, spacerLength,
                        distanceToCutSiteFromPAM_bp):
    """Collect spacers adjacent to PAM matches on both strands of an interval.

    The interval sequence is obtained from ``coords2fa`` and upper-cased.
    ``SeqUtils.nt_search`` is run for the PAM (plus strand) and, unless the
    PAM equals its own reverse complement, for the reverse-complemented PAM
    (minus strand).  On each strand a hit is kept only when its start lies
    more than ``cutoff_spacing`` after the previously kept hit.

    The module-level ``PAMside`` selects the geometry: ``3`` takes the
    branches commented as Cas9-like, any other value the Cpf1-like ones.
    Progress counts are printed; the print statements are written as
    single-argument ``print(...)`` calls so the function parses on both
    Python 2 and Python 3.

    Args:
        chromPos: Chromosome identifier, passed to ``coords2fa`` and stored
            in every result row.
        chromStartRG: Interval start; added to in-interval positions to get
            reference-genome positions.
        chromEndRG: Interval end, passed to ``coords2fa``.
        seqStr: PAM pattern as a string.
        cutoff_spacing: Minimum spacing threshold between kept spacers.
        referenceGenomeForDAS: Passed through to ``coords2fa``.
        spacerLength: Length of the spacer.
        distanceToCutSiteFromPAM_bp: Distance of the cut site from the PAM.

    Returns:
        tuple: ``(listSpacer, spacerNumTotal, meanDist)`` where each
        ``listSpacer`` row is ``[spacerNum, strand, startLocInRefSeq,
        endLocInRefSeq, chromPos, startLocInRefGenome, endLocInRefGenome,
        cutSiteInRefGenome, distFromPrevSpacer, spacerAsStr, exactPAM,
        GCcontent]`` and ``meanDist`` is ``np.mean`` of the recorded
        distances.
    """
    from Bio import SeqFeature

    if PAMside == 3:
        # For SpCas9 (as an example): spacerLength = 20bp ;
        # distanceToCutSiteFromPAM_bp = 3bp; distanceToCutSiteFrom5pEnd = 17bp
        distanceToCutSiteFrom5pEnd = spacerLength - distanceToCutSiteFromPAM_bp
    else:
        # For AsCpf1 (as an example): spacerLength = 20bp ;
        # distanceToCutSiteFromPAM_bp = 19bp; distanceToCutSiteFrom5pEnd = 18bp
        distanceToCutSiteFrom5pEnd = distanceToCutSiteFromPAM_bp - 1

    s = coords2fa(chromPos, chromStartRG, chromEndRG, referenceGenomeForDAS)

    s = s.upper()
    PAM = Seq(seqStr, IUPAC.ambiguous_dna)
    PAM_length = len(seqStr)
    # The minus-strand pass is skipped when the PAM is its own reverse
    # complement.
    DoRevComp = seqStr != str(PAM.reverse_complement())

    listSpacer = []
    listDistBetweenSpacers = []

    spacerNum = 0
    prevStartLocInRefSeq = -9999
    if PAMside == 3:
        # Cas9
        gbStringForSearch = s[spacerLength:]
    else:
        # Cpf1, get all but last ~20 bases of sequence
        gbStringForSearch = s[:-spacerLength]

    spacerInds = SeqUtils.nt_search(gbStringForSearch, str(PAM))
    if len(spacerInds) > 1:  # matches found
        del spacerInds[0]  # first result from nt_search is regexp expansion
        print("Plus strand sgRNAs found: {num}".format(num=len(spacerInds)))

        for item in spacerInds:
            # start and end pos of PAM
            startPos = SeqFeature.ExactPosition(item)
            endPos = SeqFeature.ExactPosition(item + PAM_length)

            if PAMside == 3:  # Cas9-like
                startLocInRefSeq = startPos + 1
                endLocInRefSeq = startLocInRefSeq + spacerLength - 1
            else:  # Cpf1-like
                startLocInRefSeq = endPos  # Starts immediately after PAM
                endLocInRefSeq = startLocInRefSeq + spacerLength

            startLocInRefGenome = chromStartRG + startLocInRefSeq
            endLocInRefGenome = chromStartRG + endLocInRefSeq - 1
            cutSiteInRefGenome = startLocInRefGenome + distanceToCutSiteFrom5pEnd

            # Only add the spacer if it is a certain distance from the previous spacer
            if (startLocInRefSeq - prevStartLocInRefSeq) > cutoff_spacing:
                spacerNum += 1
                strand = "+"
                if spacerNum > 1:
                    distFromPrevSpacer = startLocInRefSeq - prevStartLocInRefSeq
                else:
                    distFromPrevSpacer = 0
                if PAMside == 3:
                    spacerAsStr = str(s[startLocInRefSeq - 1:endLocInRefSeq])
                    exactPAM = s[endLocInRefSeq:endLocInRefSeq + PAM_length]
                else:
                    spacerAsStr = str(s[startLocInRefSeq:endLocInRefSeq])
                    # Python slices: second index is first char you *DON'T* want
                    exactPAM = s[startLocInRefSeq -
                                 PAM_length:startLocInRefSeq]

                GCcontent = SeqUtils.GC(spacerAsStr)
                listItem = [
                    spacerNum, strand, startLocInRefSeq, endLocInRefSeq,
                    chromPos, startLocInRefGenome, endLocInRefGenome,
                    cutSiteInRefGenome, distFromPrevSpacer, spacerAsStr,
                    exactPAM, GCcontent
                ]
                listSpacer.append(listItem)
                listDistBetweenSpacers.append(float(distFromPrevSpacer))
                prevStartLocInRefSeq = startLocInRefSeq

    print("Plus strand sgRNAs included after minimum spacing (> {limit}bp between sgRNAs): {num}".format(
        limit=cutoff_spacing, num=spacerNum))
    spacerNumTotal = spacerNum

    # Search rev complement of PAM
    prevStartLocInRefSeq = -9999
    spacerNum = 0
    if DoRevComp:
        if PAMside == 3:
            # get all but last ~20 bases of sequence
            gbStringForSearch = s[:-spacerLength]
        else:
            gbStringForSearch = s[spacerLength:]

        spacerInds = SeqUtils.nt_search(gbStringForSearch,
                                        str(PAM.reverse_complement()))
        if len(spacerInds) > 1:  # matches found
            del spacerInds[0]  # first result from nt_search is regexp expansion
            print("Minus strand sgRNAs found: {num}".format(
                num=len(spacerInds)))

            for item in spacerInds:
                startPos = SeqFeature.ExactPosition(item)
                endPos = SeqFeature.ExactPosition(item + PAM_length)

                # Start and end locations are flipped here due to reverse strand
                if PAMside == 3:
                    endLocInRefSeq = endPos + 1  # flipped for reverse strand
                    startLocInRefSeq = endLocInRefSeq + spacerLength - 1  # flipped for reverse strand
                else:
                    # startLocInRefSeq is 5' end of spacer on PAM-containing strand
                    # endLocInRefSeq is 3' end of spacer on PAM-containing strand
                    # Hence endLocInRefSeq <  startLocInRefSeq since this is reverse strand
                    startLocInRefSeq = startPos + spacerLength  # b/c spacer length is the offset between gbStringForSearch to RefSeq
                    endLocInRefSeq = startLocInRefSeq - spacerLength + 1

                startLocInRefGenome = chromStartRG + startLocInRefSeq - 1
                endLocInRefGenome = chromStartRG + endLocInRefSeq - 1
                cutSiteInRefGenome = startLocInRefGenome - distanceToCutSiteFrom5pEnd

                # Only add the spacer if it is a certain distance from the previous spacer
                if (startLocInRefSeq - prevStartLocInRefSeq) > cutoff_spacing:
                    spacerNum += 1
                    strand = "-"
                    if spacerNum > 1:
                        distFromPrevSpacer = startLocInRefSeq - prevStartLocInRefSeq
                    else:
                        distFromPrevSpacer = 0
                    if PAMside == 3:  # Cas9-like
                        spacerRC = Seq(
                            str(s[endLocInRefSeq - 1:startLocInRefSeq]),
                            IUPAC.ambiguous_dna)
                        spacerAsStr = str(spacerRC.reverse_complement())
                        exactPAM = str(
                            Seq(
                                str(s[endLocInRefSeq -
                                      (PAM_length + 1):endLocInRefSeq - 1]),
                                IUPAC.ambiguous_dna).reverse_complement())
                    else:  # Cpf1-like
                        spacerRC = Seq(
                            str(s[endLocInRefSeq - 1:startLocInRefSeq]),
                            IUPAC.ambiguous_dna)
                        spacerAsStr = str(spacerRC.reverse_complement())
                        exactPAM = str(
                            Seq(
                                str(s[startLocInRefSeq:startLocInRefSeq +
                                      PAM_length]),
                                IUPAC.ambiguous_dna).reverse_complement())

                    GCcontent = SeqUtils.GC(spacerAsStr)
                    listItem = [
                        spacerNum, strand, startLocInRefSeq, endLocInRefSeq,
                        chromPos, startLocInRefGenome, endLocInRefGenome,
                        cutSiteInRefGenome, distFromPrevSpacer, spacerAsStr,
                        exactPAM, GCcontent
                    ]

                    listSpacer.append(listItem)
                    listDistBetweenSpacers.append(float(distFromPrevSpacer))
                    prevStartLocInRefSeq = startLocInRefSeq

        print("Minus strand sgRNAs included after minimum spacing (> {limit}bp between sgRNAs): {num}".format(
            limit=cutoff_spacing, num=spacerNum))
        spacerNumTotal = spacerNumTotal + spacerNum

    # NOTE: with no kept spacers this is the mean of an empty array.
    arrDistBetweenSpacers = np.asarray(listDistBetweenSpacers)
    meanDist = np.mean(arrDistBetweenSpacers)
    return (listSpacer, spacerNumTotal, meanDist)
Beispiel #22
0
    def clean(self):
        '''
        Clean data, adding an unsaved InchwormAssembly model ('assembly') and
        a list of stages ('stages') to self.cleaned_data
        '''
        cleaned_data = super(PathwayForm, self).clean()

        # Don't do anything if some fields are missing
        if not all(x in cleaned_data.keys() for x in
                   ['file', 'rbs_annotation_type', 'cds_annotation_type']):
            return cleaned_data

        def validate_contiguity(features):
            for i in range(len(features) - 1):
                if features[i].location.end != features[i+1].location.start:
                    raise forms.ValidationError(
                        'Features {} (of type {}) and {} (of type {}) must be contiguous.'.format(
                            features[i].qualifiers['label'][0],
                            features[i].type,
                            features[i+1].qualifiers['label'][0],
                            features[i+1].type,
                        ))

        record = SeqIO.read(cleaned_data['file'], 'genbank')
        feature_dict = {
            (feature.qualifiers['label'][0], feature.type): feature
            for feature in record.features
        }

        # Make sure all the required features are present
        pathway_features = []
        for stage_name in self.stage_names:
            rbs_key = (stage_name, cleaned_data['rbs_annotation_type'])
            cds_key = (stage_name, cleaned_data['cds_annotation_type'])
            try:
                pathway_features.append(feature_dict[rbs_key])
                pathway_features.append(feature_dict[cds_key])
            except KeyError as e:
                raise forms.ValidationError(
                    'Stage {} has no feature of type {}.'.format(*e.args[0]))

        # Make sure all the features are contiguous
        validate_contiguity(pathway_features)

        # Save all the annealable sequences
        annealable_seqs = []
        for i, stage_name in enumerate(self.stage_names):
            cds_feature = pathway_features[2*i + 1]

            annealable_seq = None

            for sequence_context in self.sequence_contexts:
                annealable_seq_name = '{} from {}'.format(
                    stage_name, sequence_context['name'])

                sequence_context['file'].seek(0)
                context_record = SeqIO.read(sequence_context['file'], 'genbank')

                search_result = SeqUtils.nt_search(
                    str(context_record.seq),
                    str(cds_feature.extract(record).seq),
                )

                if len(search_result) > 1:
                    annealable_seq = Gene(
                        file=sequence_context['file'],
                        start=search_result[1] + 1,
                        end=search_result[1] + len(cds_feature),
                        strand=1,
                        name=annealable_seq_name,
                        type=Gene.ANNEALABLE_SEQ,
                    )
                    annealable_seq.save()
                    break

                # No forward match found, so search the reverse strand
                rev_search_result = SeqUtils.nt_search(
                    str(context_record.seq),
                    str(cds_feature.extract(record).seq.reverse_complement()),
                )
                if len(rev_search_result) > 1:
                    annealable_seq = Gene(
                        file=sequence_context['file'],
                        start=rev_search_result[1] + 1,
                        end=rev_search_result[1] + len(cds_feature),
                        strand=-1,
                        name=annealable_seq_name,
                        type=Gene.ANNEALABLE_SEQ,
                    )
                    annealable_seq.save()
                    break

            if annealable_seq is None:
                # No sequence context matched, so do non-nested PCR directly off
                # the coding sequence
                seq_file = ContentFile('')
                annealable_seq = Gene(
                    file=seq_file,
                    start=1,
                    end=len(cds_feature),
                    strand=1,
                    name=stage_name,
                    type=Gene.ANNEALABLE_SEQ,
                )
                annealable_seq.save()
                seq_record = cds_feature.extract(record)
                seq_record.id = ''
                seq_record.name = ''
                SeqIO.write(seq_record, seq_file, 'genbank')
                annealable_seq.file.save(stage_name, seq_file)

            annealable_seqs.append(annealable_seq)

        # Save the genome
        if len(record[:pathway_features[0].location.start]) < self.fwd_ha_len:
            raise forms.ValidationError(
                '5’ genome context must be at least {} bp long.'.format(
                    self.fwd_ha_len))
        if len(record[pathway_features[-1].location.end:]) < self.rev_ha_len:
            raise forms.ValidationError(
                '3’ genome context must be at least {} bp long.'.format(
                    self.rev_ha_len))
        genome_record = record[:pathway_features[0].location.start] + \
                        record[pathway_features[-1].location.end:]
        genome_record.name = 'genome'
        genome_file = ContentFile('')
        genome = Gene(
            file=genome_file,
            start=pathway_features[0].location.start + 1,
            end=pathway_features[0].location.start,
            strand=1,
            name='Genome context',
        )
        genome.save()
        SeqIO.write(genome_record, genome_file, 'genbank')
        genome.file.save('genome', genome_file)

        # Save the stages
        cleaned_data['stages'] = []
        for i, stage_name in enumerate(self.stage_names):
            rbs_feature = pathway_features[2*i]
            stage = Stage(
                degeneracy=str(rbs_feature.extract(record).seq),
                annealable_seq = annealable_seqs[i],
                selection_cassette=self.selection_cassettes[i],
                name=stage_name,
            )
            cleaned_data['stages'].append(stage)
            stage.save()

        # Save the InchwormAssembly object
        cleaned_data['assembly'] = InchwormAssembly(
            genome=genome,
            enzyme=self.enzyme,
            library_size=self.library_size,
            dna_required=self.dna_required,
            fwd_ha_len=self.fwd_ha_len,
            rev_ha_len=self.rev_ha_len,
        )
        return cleaned_data
Beispiel #23
0
    def get_context_data(self, **kwargs):
        '''
        Build the template context describing every reaction of the assembly.

        Each entry of self.object.output is annotated in place with
        'gg_primer_names', 'integration_primer_names', 'phenotype' and,
        when self.get_library_sizes() returns something truthy,
        'dna_required'.

        The following keys are added to the parent context:
          output: the annotated self.object.output.
          primers: self.object.primers, iterated as (name, sequence) pairs.
          gg_pcr_details: one dict per unique Golden Gate PCR, keyed on
              (forward primer name, reverse primer name, template sequence).
          gg_details: per stage, the PCR products that go into its
              Golden Gate reaction.
          round2_pcr_details: per stage, the second-round PCR.
          transformation_details: per stage, insert name and phenotype.

        Raises AssertionError if a Golden Gate primer's annealing sequence
        cannot be found in its template.
        '''
        output = self.object.output
        primers = self.object.primers

        library_sizes = self.get_library_sizes()

        # Stages in primary-key order; entry i pairs with output[i].
        # Materialized once instead of re-querying inside every loop.
        stages = list(self.object.stages.order_by('pk'))

        primer_names_by_sequence = {
            sequence: name for name, sequence in primers
        }

        def primer_name(primer):
            return primer_names_by_sequence[str(primer.full_seq().seq)]

        for i, stage_output in enumerate(output):
            stage_output['gg_primer_names'] = [
                (primer_name(primer1), primer_name(primer2))
                for primer1, primer2 in stage_output['gg'].primers
            ]
            stage_output['integration_primer_names'] = [
                primer_name(primer)
                for primer in stage_output['insert'].generate_primers()
            ]
            stage_output['phenotype'] = \
                stages[i].selection_cassette.phenotype
            if library_sizes:
                stage_output['dna_required'] = \
                    library_sizes[i] * self.object.dna_required

        # Compile unique Golden Gate PCR reactions for the tabular view
        gg_pcrs_by_primers_and_template = dict()
        gg_pcr_details = []
        for i, stage_output in enumerate(output):
            for j in range(3):
                # A list comprehension (not map) so the concatenation below
                # also works on Python 3, where map() returns an iterator.
                primer_names = [
                    primer_name(primer)
                    for primer in stage_output['gg'].primers[j]
                ]
                search_template = str(
                    stage_output['gg'].genes[j].subrecord().seq.upper())
                primer_names_and_template = tuple(
                    primer_names + [search_template])
                if primer_names_and_template in gg_pcrs_by_primers_and_template:
                    # Same primers on the same template: already listed.
                    continue

                # Get length of PCR product
                primer1 = stage_output['gg'].primers[j][0]
                primer2 = stage_output['gg'].primers[j][1]
                forward_search_result = SeqUtils.nt_search(
                    search_template,
                    primer1.anneal_seq().upper(),
                )
                reverse_search_result = SeqUtils.nt_search(
                    search_template,
                    primer2.anneal_seq().reverse_complement().upper(),
                )

                # nt_search returns [pattern, pos, ...]; a length of 1 means
                # the primer does not anneal to the template.
                assert len(forward_search_result) > 1 and \
                       len(reverse_search_result) > 1

                # Get name of template
                stage = stages[i]
                if j == 0:
                    template_name = stage.annealable_seq.name
                elif j == 1:
                    template_name = stage.selection_cassette.name
                else:
                    template_name = 'Genome'

                # Get primer Tm
                forward_tm = recombineering.utils.Tm(
                    str(primer1.anneal_seq().seq))
                reverse_tm = recombineering.utils.Tm(
                    str(primer2.anneal_seq().seq))

                details = {
                    'product': 'gg{}-{}'.format(i+1, j+1),
                    'size': len(primer1.overhang) +
                            (reverse_search_result[1] -
                             forward_search_result[1]) +
                            len(primer2.full_seq()),
                    'primer_names': primer_names_and_template,
                    'template': template_name,
                    'forward_tm': forward_tm,
                    'reverse_tm': reverse_tm,
                }
                gg_pcrs_by_primers_and_template[
                    primer_names_and_template] = details
                gg_pcr_details.append(details)

        # Compile information about second-round PCRs
        round2_pcr_details = []
        for i, stage_output in enumerate(output):
            insert = stage_output['insert']
            insert_len = sum([
                insert.fwd_ha_len,
                len(insert.degeneracy),
                len(insert.sequence),
                insert.rev_ha_len,
            ])

            details = {
                'product': 'stage{}'.format(i+1),
                'size': insert_len,
                'primer_names': [
                    primer_name(primer)
                    for primer in insert.generate_primers()
                ],
                'template': 'gg{}'.format(i+1),
                'forward_tm': recombineering.utils.Tm(
                    str(insert.generate_primers()[0].anneal_seq().seq)),
                'reverse_tm': recombineering.utils.Tm(
                    str(insert.generate_primers()[1].anneal_seq().seq)),
            }

            if library_sizes:
                details['dna_required'] = \
                    library_sizes[i] * self.object.dna_required

            round2_pcr_details.append(details)

        # Determine what goes into which Golden Gate reaction
        gg_details = []
        for i, stage_output in enumerate(output):
            fragments = []
            for j, (primer1, primer2) in enumerate(stage_output['gg'].primers):
                template = str(stage_output['gg'].genes[j].subrecord().seq.upper())
                primer_names_and_template = (
                    primer_name(primer1),
                    primer_name(primer2),
                    template,
                )
                fragments.append(
                    gg_pcrs_by_primers_and_template[primer_names_and_template]['product'])

            gg_details.append({
                'product': 'gg{}'.format(i+1),
                'size': len(stage_output['gg'].product),
                'fragments': fragments,
            })

        # Transformation details
        transformation_details = []
        for i in range(len(output)):
            transformation_details.append({
                'insert_name': round2_pcr_details[i]['product'],
                'phenotype': stages[i].selection_cassette.phenotype,
            })

        context = super(OutputView, self).get_context_data(**kwargs)
        context['output'] = output
        context['primers'] = primers

        context['gg_pcr_details'] = gg_pcr_details
        context['gg_details'] = gg_details
        context['round2_pcr_details'] = round2_pcr_details
        context['transformation_details'] = transformation_details

        return context
if fastafile == "test3prime.fasta":
    output_fh_name = "seqs_w_for_removed.fasta"

output_fh = open(output_fh_name, mode='w+')

output_text_name = "info_w_for_removed.txt"
if fastafile == "test3prime.fasta":
    output_text_name = "info_w_for_removed.txt"

output_text_fh = open(output_text_name, mode='w+')

for record in parsed:
    try:
        sequence = str(record.seq)
        search = SeqUtils.nt_search(sequence, adapter)  #This will search the
        index = int(
            search[1]
        )  #If it finds the adapter, is the starting index from which it was found.
        adapter_start = index
        adapter_end = index + len_adapter
        count_adapter_found += 1
        total_seq_count += 1
        if removeadapters == "True":  #if the value is true, it removes the adapters from the sequences.
            if end_defn == "5":
                record = record[
                    adapter_end:]  #If a 5' adapter, you remove adapter from beginning
            elif end_defn == "3":
                record = record[:
                                adapter_start]  #If it is a 3' adapter, you remove the adapter at the end
        elif removeadapters == "False":  #if the value is false, it does not remove the adapters from the sequences.
from Bio import SeqIO
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq
from Bio import motifs
from Bio import SeqUtils

def _write_motif_hits(motif, out_path,
                      fasta_path='gencode.v26.lncRNA_transcripts.fa'):
    """Search every FASTA record for *motif* and write the hits to *out_path*.

    For each record two lines are written: ``>`` followed by the record id,
    then the ``str()`` of the list returned by ``SeqUtils.nt_search`` (the
    expanded search pattern followed by the match positions).
    """
    pattern = str(motif)
    with open(out_path, "w") as f:
        for seq_record in SeqIO.parse(fasta_path, 'fasta'):
            f.write(">" + str(seq_record.id) + "\n")
            # Search the nucleotide sequence itself: str(seq_record) is the
            # record's multi-line summary (ID, name, description, ...), not
            # the sequence.
            result = SeqUtils.nt_search(str(seq_record.seq), pattern)
            f.write(str(result) + "\n")


with open("sites/MA0106.1.sites") as handle:
    p53 = motifs.read(handle, "sites")

motif = p53.degenerate_consensus
_write_motif_hits(motif, "motif_result_p53.txt")

##

with open("sites/MA0001.1.sites") as handle:
    AGL3 = motifs.read(handle, "sites")

motif = AGL3.degenerate_consensus
_write_motif_hits(motif, "motif_result_AGL3.txt")
# Choose the output file names; the "test3prime.fasta" input gets its own
# pair of output files so the results of the two test inputs do not collide.
if fastafile=="test3prime.fasta":
    output_fh_name="output2.fasta"

# FASTA output that receives the (optionally trimmed) reads.
output_fh = open(output_fh_name, mode='w+')

output_text_name = "output.txt"
if fastafile=="test3prime.fasta":
    output_text_name="output2.txt"
# Text output handle; nothing is written to it within this section.
output_text_fh = open(output_text_name, mode='w+')


# Look for the adapter in every read, count hits and misses, and write the
# reads in which the adapter was found (trimmed if requested).
for record in parsed:
    try:
        sequence = str(record.seq)
        # nt_search returns the search pattern followed by the start
        # position of every match.
        search = SeqUtils.nt_search(sequence, adapter)
        # Start index of the first match; raises IndexError when there is
        # no match, which is handled by the except clause below.
        index = int(search[1])
        adapter_start = index
        adapter_end = index+len_adapter
        count_adapter_found +=1
        total_seq_count+=1
        if removeadapters == "True": # Trim the adapter from the read.
            if end_defn=="5":
                record = record[adapter_end:] # 5' adapter: keep what follows the adapter.
            elif end_defn=="3":
                record = record[:adapter_start] # 3' adapter: keep what precedes the adapter.
        elif removeadapters == "False": # Leave the read untouched.
            record = record
        # Write the read whether or not it was trimmed.
        # NOTE(review): reads without an adapter never reach this line,
        # because the IndexError above jumps straight to the except clause
        # -- confirm that dropping them from the output is intended.
        SeqIO.write(record, output_fh, format="fasta")
    except IndexError:
        # No match: search holds only the pattern, so search[1] failed.
        count_adapter_not_found+=1
Beispiel #27
0
def createdb():
	"""Fetch three NCBI nucleotide records and store them, with their 'GG' sites, in SQLite.

	For each record the first 20 bases are kept (upper-cased), reverse
	complemented, and both strands are searched for the pattern 'GG' with
	SeqUtils.nt_search. A table named by the module-level table_name2 is
	created in the database at sqlite_file, and one row per record is
	inserted.

	NOTE(review): the INSERT statement hard-codes the table name
	'Gather_data' and its column names; this presumably matches
	table_name2 and the *_column constants -- confirm.
	"""
	gis = [100753385, 100689306, 100751648]
	accession = []
	description = []
	sequence = []

	request = Entrez.epost("nucleotide", id=",".join(map(str, gis)))
	result = Entrez.read(request)
	webEnv = result["WebEnv"]
	queryKey = result["QueryKey"]
	handle = Entrez.efetch(db="nucleotide", retmode="xml", webenv=webEnv, query_key=queryKey)
	for r in Entrez.parse(handle):
		# Grab the GI#; fall back to None when the record carries no
		# (or a malformed) "gi|..." identifier instead of crashing.
		try:
			gi = int([x for x in r['GBSeq_other-seqids'] if "gi" in x][0].split("|")[1])
		except (ValueError, IndexError):
			gi = None
		# Whitespace-separated tokens: accession, the words of the
		# definition, then the first 20 bases of the sequence.
		tokens = (" " + r["GBSeq_primary-accession"] + " " + r["GBSeq_definition"]
				+ "\n" + r["GBSeq_sequence"][0:20]).split()
		accession.append(">GI" + str(gi))
		description.append(' '.join(tokens[0:3]))
		sequence.append(tokens[-1].upper())
	handle.close()

	alt_map = {'ins': '0'}
	complement = {'A': 'T', 'G': 'C', 'T': 'A', 'C': 'G'}

	def reverse_complement(seq):
		"""Return the reverse complement of seq, keeping 'ins' markers intact."""
		# Temporarily replace multi-character markers with a single
		# placeholder character so they survive the reversal.
		for k, v in alt_map.items():
			seq = seq.replace(k, v)
		bases = ''.join(reversed([complement.get(base, base) for base in seq]))
		for k, v in alt_map.items():
			bases = bases.replace(v, k)
		return bases

	complementary_sequence = [reverse_complement(seq) for seq in sequence]

	# Positions of 'GG' on each strand, stored as the str() of nt_search's
	# result list (pattern followed by match positions).
	pattern = 'GG'
	exon = [str(SeqUtils.nt_search(exons, pattern)) for exons in sequence]
	comp_exon = [str(SeqUtils.nt_search(comp, pattern)) for comp in complementary_sequence]

	conn = sqlite3.connect(sqlite_file)
	try:
		c = conn.cursor()

		c.execute('CREATE TABLE {tn} ({nf} {ft} PRIMARY KEY)'
				.format(tn=table_name2, nf=new_field, ft=field_type))
		extra_columns = [
			(id_column, column_type2),
			(description_column, column_type3),
			(seq_column, column_type4),
			(comp_seq_column, column_type5),
			(PAM_column1, column_type6),
			(PAM_column2, column_type7),
		]
		for cn, ct in extra_columns:
			c.execute("ALTER TABLE {tn} ADD COLUMN '{cn}' {ct}"
					.format(tn=table_name2, cn=cn, ct=ct))

		insert_sql = '''INSERT INTO Gather_data(No, gi_accession, description, sequence,Complementary_sequence,PAMsites_exons,PAMsites_complementary) VALUES(?,?,?,?,?,?,?)'''
		# One row per fetched record; every column comes from the SAME
		# record (the original mixed in index 0/1 values for rows 2 and 3).
		rows = zip(accession, description, sequence, complementary_sequence, exon, comp_exon)
		for no, row in enumerate(rows, 1):
			c.execute(insert_sql, (no,) + row)
		conn.commit()
	finally:
		# Release the database file even if table creation or an insert fails.
		conn.close()
Beispiel #28
0
def FindSpacersInterval(chromPos, chromStartRG, chromEndRG, seqStr, cutoff_spacing, referenceGenomeForDAS,spacerLength, distanceToCutSiteFromPAM_bp):
	"""Find sgRNA spacers next to every PAM match in a genomic interval.

	The interval sequence is fetched with coords2fa and upper-cased. The
	PAM (seqStr, may contain IUPAC ambiguity codes) is searched on the
	plus strand and, unless it equals its own reverse complement, its
	reverse complement is searched as well to find minus-strand sites.
	The module-level PAMside selects the geometry: 3 means the PAM is
	3' of the spacer (Cas9-like), anything else means 5' (Cpf1-like).
	A spacer is kept only if its start lies more than cutoff_spacing
	from the previously kept spacer on the same strand.

	Returns a tuple (listSpacer, spacerNumTotal, meanDist) where each
	listSpacer entry is
	[spacerNum, strand, startLocInRefSeq, endLocInRefSeq, chromPos,
	 startLocInRefGenome, endLocInRefGenome, cutSiteInRefGenome,
	 distFromPrevSpacer, spacerAsStr, exactPAM, GCcontent],
	spacerNumTotal counts the kept spacers on both strands, and meanDist
	is np.mean of the recorded distances (nan when no spacer was kept).
	"""
	from Bio import SeqFeature

	if PAMside == 3:
		distanceToCutSiteFrom5pEnd = spacerLength - distanceToCutSiteFromPAM_bp
		# For SpCas9 (as an example): spacerLength = 20bp ; distanceToCutSiteFromPAM_bp = 3bp; distanceToCutSiteFrom5pEnd = 17bp
	else:
		distanceToCutSiteFrom5pEnd = distanceToCutSiteFromPAM_bp-1
		# For AsCpf1 (as an example): spacerLength = 20bp ; distanceToCutSiteFromPAM_bp = 19bp; distanceToCutSiteFrom5pEnd = 18bp

	s = coords2fa(chromPos, chromStartRG, chromEndRG, referenceGenomeForDAS)
	s = s.upper()
	PAM = Seq(seqStr, IUPAC.ambiguous_dna)
	PAM_length = len(seqStr)
	# A PAM equal to its own reverse complement matches both strands at
	# once, so the separate minus-strand search is skipped for it.
	DoRevComp = seqStr != str(PAM.reverse_complement())

	listSpacer = []
	listDistBetweenSpacers = []

	spacerNum = 0
	prevStartLocInRefSeq = -9999
	# Trim the searched string so that every PAM hit leaves room for a
	# full-length spacer on the appropriate side.
	if PAMside == 3:
		gbStringForSearch = s[spacerLength:]	# Cas9
	else:
		gbStringForSearch = s[:-spacerLength]   # Cpf1, get all but last ~20 bases of sequence

	# First result from nt_search is the regexp expansion of the pattern;
	# the remaining entries are the match positions.
	spacerInds = SeqUtils.nt_search(gbStringForSearch, str(PAM))[1:]
	if spacerInds:	# matches found
		print("Plus strand sgRNAs found: {num}".format(num=len(spacerInds)))

		for item in spacerInds:
			startPos = SeqFeature.ExactPosition(item)	# start and end pos of PAM
			endPos = SeqFeature.ExactPosition(item+PAM_length)

			if PAMside == 3:		# Cas9-like
				startLocInRefSeq = startPos+1
				endLocInRefSeq = startLocInRefSeq+spacerLength-1
			else:					# Cpf1-like
				startLocInRefSeq = endPos  # Starts immediately after PAM
				endLocInRefSeq = startLocInRefSeq+spacerLength

			startLocInRefGenome = chromStartRG+startLocInRefSeq
			endLocInRefGenome = chromStartRG+endLocInRefSeq-1
			cutSiteInRefGenome = startLocInRefGenome+distanceToCutSiteFrom5pEnd

			# Only add the spacer if it is a certain distance from the previous spacer
			if (startLocInRefSeq-prevStartLocInRefSeq) > cutoff_spacing:
				spacerNum += 1
				strand = "+"
				if spacerNum > 1:
					distFromPrevSpacer = startLocInRefSeq-prevStartLocInRefSeq
				else:
					distFromPrevSpacer = 0
				if PAMside == 3:
					spacerAsStr = str(s[startLocInRefSeq-1:endLocInRefSeq])
					exactPAM = s[endLocInRefSeq:endLocInRefSeq+PAM_length]
				else:
					spacerAsStr = str(s[startLocInRefSeq:endLocInRefSeq])
					exactPAM = s[startLocInRefSeq-PAM_length:startLocInRefSeq]  # Python slices: second index is first char you *DON'T* want

				GCcontent = SeqUtils.GC(spacerAsStr)
				listItem = [spacerNum, strand, startLocInRefSeq, endLocInRefSeq, chromPos, startLocInRefGenome, endLocInRefGenome,
							cutSiteInRefGenome, distFromPrevSpacer, spacerAsStr, exactPAM, GCcontent]
				listSpacer.append(listItem)
				listDistBetweenSpacers.append(float(distFromPrevSpacer))
				prevStartLocInRefSeq = startLocInRefSeq

	print("Plus strand sgRNAs included after minimum spacing (> {limit}bp between sgRNAs): {num}".format(limit=cutoff_spacing,num=spacerNum))
	spacerNumTotal = spacerNum

	# Search rev complement of PAM
	prevStartLocInRefSeq = -9999
	spacerNum = 0
	if DoRevComp:
		if PAMside == 3:
			gbStringForSearch = s[:-spacerLength]   # get all but last ~20 bases of sequence
		else:
			gbStringForSearch = s[spacerLength:]

		spacerInds = SeqUtils.nt_search(gbStringForSearch,str(PAM.reverse_complement()))[1:]
		if spacerInds:	# matches found
			print("Minus strand sgRNAs found: {num}".format(num=len(spacerInds)))

			for item in spacerInds:
				startPos = SeqFeature.ExactPosition(item)
				endPos = SeqFeature.ExactPosition(item+PAM_length)

				# Start and end locations are flipped here due to reverse strand
				if PAMside == 3:
					endLocInRefSeq = endPos+1  # flipped for reverse strand
					startLocInRefSeq = endLocInRefSeq+spacerLength-1  # flipped for reverse strand
				else:
					# startLocInRefSeq is 5' end of spacer on PAM-containing strand
					# endLocInRefSeq is 3' end of spacer on PAM-containing strand
					# Hence endLocInRefSeq <  startLocInRefSeq since this is reverse strand
					startLocInRefSeq = startPos + spacerLength # b/c spacer length is the offset between gbStringForSearch to RefSeq
					endLocInRefSeq = startLocInRefSeq - spacerLength +1

				startLocInRefGenome = chromStartRG+startLocInRefSeq-1
				endLocInRefGenome = chromStartRG+endLocInRefSeq-1
				cutSiteInRefGenome = startLocInRefGenome-distanceToCutSiteFrom5pEnd

				# Only add the spacer if it is a certain distance from the previous spacer
				if (startLocInRefSeq-prevStartLocInRefSeq) > cutoff_spacing:
					spacerNum += 1
					strand = "-"
					if spacerNum > 1:
						distFromPrevSpacer = startLocInRefSeq-prevStartLocInRefSeq
					else:
						distFromPrevSpacer = 0
					# The spacer is read off the plus strand and reverse
					# complemented; the slice is the same for both PAM sides.
					spacerRC = Seq(str(s[endLocInRefSeq-1:startLocInRefSeq]), IUPAC.ambiguous_dna)
					spacerAsStr = str(spacerRC.reverse_complement())
					if PAMside == 3:	# Cas9-like
						exactPAM = str(Seq(str(s[endLocInRefSeq-(PAM_length+1):endLocInRefSeq-1]), IUPAC.ambiguous_dna).reverse_complement())
					else:	# Cpf1-like
						exactPAM = str(Seq(str(s[startLocInRefSeq:startLocInRefSeq+PAM_length]), IUPAC.ambiguous_dna).reverse_complement())

					GCcontent = SeqUtils.GC(spacerAsStr)
					listItem = [spacerNum, strand, startLocInRefSeq, endLocInRefSeq, chromPos, startLocInRefGenome, endLocInRefGenome,
								cutSiteInRefGenome, distFromPrevSpacer, spacerAsStr, exactPAM, GCcontent]

					listSpacer.append(listItem)
					listDistBetweenSpacers.append(float(distFromPrevSpacer))
					prevStartLocInRefSeq = startLocInRefSeq

		print("Minus strand sgRNAs included after minimum spacing (> {limit}bp between sgRNAs): {num}".format(limit=cutoff_spacing,num=spacerNum))
		spacerNumTotal = spacerNumTotal+spacerNum

	arrDistBetweenSpacers = np.asarray(listDistBetweenSpacers)
	meanDist = np.mean(arrDistBetweenSpacers)
	return (listSpacer, spacerNumTotal, meanDist)
Beispiel #29
0
    # Pick one random record, annotate every known feature variation found in
    # it, and write the annotated copy out in GenBank format.
    for randomRec in range(1,2):
        # randrange() yields a valid 0-based index. The previous
        # randint(1, len(records)) was inclusive at both ends, so it could
        # index one past the end and could never select the first record.
        record = records[random.randrange(len(records))]
        newRecord = SeqRecord(record.seq)

        #writing Header
        newRecord.seq.alphabet = generic_dna
        newRecord.id = record.id
        newRecord.name = record.name
        newRecord.description = record.description
        recordSeq = str(record.seq)

        for feature in featureStatistic_container:
            if feature not in ["PBS", "STF"]:
                for variation in featureStatistic_container[feature]:
                    # nt_search returns the pattern followed by the match
                    # positions; writeFeature is called right after each
                    # search (presumably it consumes `occurrence` -- confirm).
                    featureSeq = str(variation.seq)
                    occurrence = SeqUtils.nt_search(recordSeq, featureSeq)
                    writeFeature(strand=1)

                    # NOTE(review): this searches the complement, not the
                    # reverse complement -- confirm that is what a strand=-1
                    # feature should match.
                    featureSeqComplement = str(variation.seq.complement())
                    occurrence = SeqUtils.nt_search(recordSeq, featureSeqComplement)
                    writeFeature(strand=-1)
            else:
                if(feature == "STF"):
                    writeSTF()

                if(feature == "PBS"):
                    writePBS()

        SeqIO.write(newRecord, output_handle, "genbank")