Esempio n. 1
0
def run_pam_finder(target_fa, seq, PAM, abs_start_pos, chr):
    """Locate ``seq + PAM`` on both strands of ``target_fa``.

    The forward strand is searched directly; the minus strand is searched
    through ``revcomp(target_fa)`` and its hits are mapped back to forward
    coordinates.

    Args:
        target_fa: Sequence to scan.
        seq: Guide pattern placed in front of the PAM.
        PAM: PAM pattern appended to ``seq`` for the search.
        abs_start_pos: Offset added to every reported coordinate.
        chr: Value stored in the first column of every row.

    Returns:
        A ``pandas.DataFrame`` with one row per hit: ``chr``, start, end,
        matched guide, matched guide + PAM, and strand (``"+"`` or ``"-"``).
    """
    minus_template = revcomp(target_fa)
    pattern = seq + PAM
    # nt_search returns the pattern first, then the hit positions.
    plus_hits = SeqUtils.nt_search(target_fa, pattern)[1:]
    minus_hits = SeqUtils.nt_search(minus_template, pattern)[1:]

    rows = [
        [
            chr,
            pos + abs_start_pos,
            pos + abs_start_pos + len(seq),
            target_fa[pos:(pos + len(seq))],
            target_fa[pos:(pos + len(seq) + len(PAM))],
            "+",
        ]
        for pos in plus_hits
    ]
    for pos in minus_hits:
        # Convert the position on the reverse complement to forward coordinates.
        mirrored = len(target_fa) - pos
        rows.append([
            chr,
            mirrored + abs_start_pos - len(seq),
            mirrored + abs_start_pos,
            minus_template[pos:(pos + len(seq))],
            minus_template[pos:(pos + len(seq) + len(PAM))],
            "-",
        ])
    return pd.DataFrame(rows)
Esempio n. 2
0
def count_amplicons(in_name, fprimer, rc):
    """Tally primer positions across the reads of a FASTQ file and print them.

    Args:
        in_name: Path of the FASTQ file to read.
        fprimer: Primer sequence (ambiguous DNA codes allowed).
        rc: When truthy, also search for the reverse complement of the
            primer and count the bases left after its last occurrence.

    Prints the counters; nothing is returned.
    """
    forward_pattern = str(Seq(fprimer, IUPAC.ambiguous_dna))
    pre_length = Counter()
    if rc:
        post_length = Counter()
        bothfound = 0
        reverse_primer = Seq(fprimer, IUPAC.ambiguous_dna).reverse_complement()
        reverse_pattern = str(reverse_primer)
        reverse_len = len(reverse_primer)

    with open(in_name, 'r') as handle:
        for read in SeqIO.parse(handle, "fastq"):
            read_text = str(read.seq)
            # nt_search yields the pattern followed by every match position.
            forward_hits = SeqUtils.nt_search(read_text, forward_pattern)
            forward_found = len(forward_hits) > 1
            if forward_found:
                # Key on the first match position.
                pre_length[forward_hits[1]] += 1
            if not rc:
                continue
            reverse_hits = SeqUtils.nt_search(read_text, reverse_pattern)
            if len(reverse_hits) <= 1:
                continue
            # Bases remaining after the last reverse-complement match.
            post_length[len(read) - reverse_hits[-1] - reverse_len] += 1
            if forward_found:
                bothfound += 1

    print("Primers found:", sum(pre_length.values()))
    print("Counts of pre_length:", pre_length)
    if rc:
        print("Reverse primers found:", sum(post_length.values()))
        print("Counts of post_length", post_length)
        print("Both primer and reverse_complement found:", bothfound)
Esempio n. 3
0
    def filtByPrimer(self, fwd_primer, rvs_primer):
        """Keep read pairs whose first primer match sits at position 0.

        Reads ``self.input_forward`` and ``self.input_reverse`` in lockstep.
        A pair is appended to ``self.output_forward`` / ``self.output_reverse``
        only when the first match of ``fwd_primer`` in the forward read and
        the first match of ``rvs_primer`` in the reverse read are both at
        index 0. Every other pair is counted as discarded.
        """
        kept = 0
        dropped = 0

        with open(self.input_forward) as fh, open(self.input_reverse) as rh:
            paired_reads = zip(FastqGeneralIterator(fh),
                               FastqGeneralIterator(rh))
            for (title_f, seq_f, qual_f), (title_r, seq_r, qual_r) in paired_reads:
                try:
                    # Index 1 is the first hit; a miss raises IndexError.
                    fwd_at_start = SeqUtils.nt_search(seq_f, fwd_primer)[1] == 0
                    rvs_at_start = SeqUtils.nt_search(seq_r, rvs_primer)[1] == 0
                except IndexError:
                    dropped += 1
                    continue

                if not (fwd_at_start and rvs_at_start):
                    dropped += 1
                    continue

                forward_entry = '@' + '\n'.join([title_f, seq_f, '+', qual_f]) + '\n'
                reverse_entry = '@' + '\n'.join([title_r, seq_r, '+', qual_r]) + '\n'
                with open(self.output_forward, 'a') as ofh:
                    ofh.write(forward_entry)
                with open(self.output_reverse, 'a') as orh:
                    orh.write(reverse_entry)
                kept += 1

        print('     Number of reads saved: ' + str(kept))
        print('     Number of reads discard :' + str(dropped))
Esempio n. 4
0
def search_motif(sequence):
    """Return protospacer coordinates next to every motif hit on both strands.

    The motif (``args.pam``) and protospacer length
    (``args.length_protospacer``) come from the module-level ``args``.

    Args:
        sequence: Record whose ``.seq`` is searched.

    Returns:
        A tuple ``(coordinates_fw, coordinates_rv)`` of ``[start, end]``
        lists. Forward hits are kept only when ``start > 0``. Reverse hits
        are converted to forward-strand coordinates and kept only when
        ``start > 0 and end < len(sequence.seq)``.
    """
    motif = str(args.pam)
    spacer_len = int(args.length_protospacer)
    dna_len = int(len(sequence.seq))

    # nt_search gives [pattern, hit0, hit1, ...] with 0-based hit positions.
    forward_hits = SeqUtils.nt_search(str(sequence.seq), motif)[1:]
    coordinates_fw = [
        [hit - spacer_len, hit]
        for hit in forward_hits
        if hit - spacer_len > 0
    ]

    # Search the reverse complement, then mirror the hits onto the forward strand.
    reverse_text = str(sequence.seq.reverse_complement())
    reverse_hits = SeqUtils.nt_search(reverse_text, motif)[1:]

    coordinates_rv = []
    for hit in reverse_hits:
        fw_start = dna_len - hit
        fw_end = dna_len - (hit - spacer_len)
        if fw_start > 0 and fw_end < dna_len:
            coordinates_rv.append([fw_start, fw_end])

    return coordinates_fw, coordinates_rv
Esempio n. 5
0
def writePBS():
    """Annotate ``newRecord`` with primer binding sites found in ``record``.

    For every variation in ``featureStatistic_container[feature]`` the tail
    of the primer (from index ``len - 15`` onward) is searched on the record
    sequence (strand 1) and on its reverse complement (strand -1). A hit
    whose surrounding window equals the full primer gets an exact end
    position; any other hit gets an ``AfterPosition`` end.
    """
    global variation, seqRecordToCheck, seqRecordToCheckComplement, difference, newFeature
    for variation in featureStatistic_container[feature]:
        full_primer = str(variation.seq)
        primer_label = variation.note

        primer_tail = full_primer[len(full_primer) - 15::]
        seqRecordToCheck = str(record.seq)
        seqRecordToCheckComplement = str(reverse_complement(record.seq))

        templates = ((seqRecordToCheck, 1), (seqRecordToCheckComplement, -1))
        for template, strand in templates:
            hits = SeqUtils.nt_search(template, primer_tail)
            if len(hits) <= 1:
                continue

            difference = len(full_primer) - len(primer_tail)
            # The first element is the search pattern; the rest are positions.
            for position in hits[1:]:
                window_start = position - difference
                window = template[window_start: window_start + len(full_primer)]
                if full_primer == window:
                    end = position + len(full_primer)
                else:
                    end = AfterPosition(position + len(full_primer))
                newFeature = SeqFeature(
                    FeatureLocation(position, end, strand=strand),
                    type=str(feature))
                newFeature.qualifiers['note'] = primer_label
                newRecord.features.append(newFeature)
Esempio n. 6
0
    def _find_iseq(self,
                   seq: Seq,
                   iseq_str: str,
                   iseq_id: str = "integrated sequence") -> int:
        """Return the position of the single occurrence of iseq_str in seq.

        Args:
            seq: Sequence to search.
            iseq_str: Subsequence to look for.
            iseq_id (optional): Name of the subsequence used in error
                messages. Defaults to "integrated sequence".

        Returns:
            int: Index of iseq_str within seq.

        Raises:
            PartException: If iseq_str is absent from seq, or if it occurs
                more than once.
        """
        # Drop the leading pattern entry; what remains are the hit positions.
        positions = SeqUtils.nt_search(str(seq), iseq_str)[1:]
        if not positions:
            raise PartException(f"{self.id} lacks {iseq_id}")
        if len(positions) > 1:
            raise PartException(f"{self.id} contains multiple {iseq_id}")
        return positions[0]
Esempio n. 7
0
def main():
    """Print, for every gene, the motifs that occur in its 3'UTR."""
    # Input locations: genome FASTA and GFF annotations.
    genome = load_file('http://tritrypdb.org/common/downloads/release-6.0/TcruziCLBrenerEsmeraldo-like/fasta/data/TriTrypDB-6.0_TcruziCLBrenerEsmeraldo-like_Genome.fasta')
    annotations = load_file('http://tritrypdb.org/common/downloads/release-6.0/TcruziCLBrenerEsmeraldo-like/gff/data/TriTrypDB-6.0_TcruziCLBrenerEsmeraldo-like.gff')

    # 3'UTR motifs from supplementary table 2 in Najafabadi et al. (2013)
    motifs = load_motifs('najafabadi_table_s1_2013.csv')

    chromosomes = load_fasta(genome)

    # 3'UTR coordinates derived from the annotations
    genes = get_utr_coords(annotations, utr_length=500)

    for gene in genes:
        utr_seq = get_3utr_seq(chromosomes, gene)

        # A motif is present when nt_search reports at least one position
        # after the leading pattern entry.
        present = [
            motif for motif in motifs
            if len(SeqUtils.nt_search(utr_seq, motif)[1:]) > 0
        ]

        print("%s: %s" % (gene['id'], ", ".join(present)))
def parseSeqRecordForOligo(record,oligo):
    '''Return True when oligo occurs in the SeqRecord sequence, else False.'''
    hits = SeqUtils.nt_search(str(record.seq), oligo)
    # The first entry is the search pattern; anything beyond it is a match position.
    return len(hits) > 1
Esempio n. 9
0
def find_PAM(seq, PAM):
    """Return the index of ``PAM`` within ``seq``.

    An exact substring match is tried first. When that fails, the first and
    then the last ``len(PAM)`` bases of ``seq`` are checked with
    ``SeqUtils.nt_search`` so that an ambiguity-coded PAM can still match.

    Args:
        seq: Sequence to search.
        PAM: PAM pattern, either literal or ambiguity-coded.

    Returns:
        The index of the PAM. When no match is found, a message is printed
        and the fallback index 20 is returned.
    """
    try:
        return seq.index(PAM)
    except ValueError:
        # Only "substring not found" triggers the ambiguity-aware fallback;
        # unrelated errors (and KeyboardInterrupt) are no longer swallowed.
        pass

    # PAM on the left
    left_search = SeqUtils.nt_search(seq[:len(PAM)], PAM)
    if len(left_search) > 1:
        return left_search[1]

    # PAM on the right
    right_search = SeqUtils.nt_search(seq[-len(PAM):], PAM)
    if len(right_search) > 1:
        return len(seq) - len(PAM)

    print("PAM: %s not found in %s. Set PAM index to 20" %
          (PAM, seq))
    return 20
Esempio n. 10
0
def is_dPAM(PAM_seq, RTT, cut_offset=-3):
    """Return 0 when the PAM matches at ``abs(cut_offset)``, otherwise 1.

    The RTT is always reverse-complemented before the search (per the
    original note, the RTT is laid out as ``xxxxxxxPAM``). It is assumed
    that the RTT contains no N.
    """
    hits = SeqUtils.nt_search(revcomp(RTT), PAM_seq)
    if len(hits) > 1 and abs(cut_offset) in hits:
        return 0
    return 1
def main():
    """Write a CSV listing, per gene, the motifs found in its 3'UTR."""
    args = parse_args()

    # Input locations: genome sequence and annotations.
    genome = load_file(args.input_genome)
    annotations = load_file(args.input_annotations)

    # 3'UTR motifs from supplementary table 2 in Najafabadi et al. (2013)
    motifs = load_motifs('najafabadi_table_s1_2013.csv')

    chromosomes = load_fasta(genome)

    # 3'UTR coordinates derived from the annotations
    genes = get_utr_coords(annotations, utr_length=args.utr_length)

    rows = []
    total = len(genes)

    for index, gene in enumerate(genes, start=1):
        utr_seq = get_3utr_seq(chromosomes, gene)

        print('Processing gene %d/%d' % (index, total))

        # Keep every motif with at least one reported match position.
        hits_in_utr = [
            motif for motif in motifs
            if len(SeqUtils.nt_search(utr_seq, motif)[1:]) > 0
        ]

        rows.append([gene['id']] + hits_in_utr)

    # One row per gene: the gene id followed by its matched motifs.
    with open(args.output, 'w') as output_file:
        csv.writer(output_file).writerows(rows)
Esempio n. 12
0
def annotate_primer(primer_name, primer_seq, primer_direction, genome):
    """Append a ``misc_feature`` marking the first primer match to ``genome``.

    Args:
        primer_name: Name appended to ``PRIMER_ANNOTATION_PREFIX`` to form
            the feature label.
        primer_seq: Primer sequence, or a ``SeqRecord`` wrapping it.
        primer_direction: Strand of the primer; ``-1`` searches for the
            reverse complement of the primer.
        genome: Record whose ``.seq`` is searched and whose ``.features``
            list receives the new feature.

    Raises:
        IndexError: If the primer does not occur in the genome sequence.
    """
    if type(primer_seq) == SeqRecord:
        primer_seq = primer_seq.seq
    if primer_direction == -1:
        primer_seq = primer_seq.reverse_complement()

    label = PRIMER_ANNOTATION_PREFIX + primer_name

    # Element 0 is the search pattern; element 1 is the first match position.
    hits = SeqUtils.nt_search(str(genome.seq), str(primer_seq))
    start = hits[1]
    location = FeatureLocation(start, start+len(primer_seq))

    genome.features.append(
        SeqFeature(
            location=location, type='misc_feature',
            strand=primer_direction,
            qualifiers={'label': [label]}))
def digest(enzyme, sequence, outfile, count):
    """Write a sense and an antisense cut line for every enzyme site found.

    Args:
        enzyme: Indexable with the enzyme name at [0], the recognition
            sequence at [1] and the cut offset at [2].
        sequence: Record providing ``.id`` and ``.seq``.
        outfile: Writable handle receiving tab-separated lines.
        count: Running cut number used in the ``cut-<n>`` label.

    Returns:
        ``count`` increased by one for every site written.
    """
    # nt_search returns the pattern first, followed by the match positions.
    sites = SeqUtils.nt_search(str(sequence.seq).upper(), enzyme[1])[1:]

    for site in sites:
        sense_pos = repr(int(site) + int(enzyme[2]))
        antisense_pos = repr(int(site) + int(len(enzyme[1]) - int(enzyme[2])))
        label = "\tcut-" + repr(count)

        # Cut on the query strand.
        sense_line = (sequence.id + "\t" + sense_pos + "\t" + sense_pos +
                      "\t" + enzyme[0] + label + "\t+\n")
        # Cut on the reverse-complement strand.
        antisense_line = (sequence.id + "\t" + antisense_pos + "\t" +
                          antisense_pos + "\t" + enzyme[0] + label + "\t-\n")

        # When the cut lies past the midpoint of the site, the antisense cut
        # has the smaller coordinate and goes first to keep the output sorted.
        if len(enzyme[1])/2 < int(enzyme[2]):
            outfile.write(antisense_line + sense_line)
        else:
            outfile.write(sense_line + antisense_line)

        count += 1
    return count
def digest(enzyme, sequence, outfile, count):
    """Write a sense and an antisense cut line for every enzyme site found.

    Args:
        enzyme: Indexable with the enzyme name at [0], the recognition
            sequence at [1] and the cut offset at [2].
        sequence: Record providing ``.id`` and ``.seq``.
        outfile: Writable handle receiving tab-separated lines.
        count: Running cut number used in the ``cut-<n>`` label.

    Returns:
        ``count`` increased by one for every site written.
    """
    # nt_search returns the pattern first, followed by the match positions.
    sites = SeqUtils.nt_search(str(sequence.seq).upper(), enzyme[1])[1:]

    for site in sites:
        sense_pos = str(int(site) + int(enzyme[2]))
        antisense_pos = str(int(site) + int(len(enzyme[1]) - int(enzyme[2])))
        label = "\tcut-" + str(count)

        # Cut on the query strand.
        sense_line = (sequence.id + "\t" + sense_pos + "\t" + sense_pos +
                      "\t" + enzyme[0] + label + "\t+\n")
        # Cut on the reverse-complement strand.
        antisense_line = (sequence.id + "\t" + antisense_pos + "\t" +
                          antisense_pos + "\t" + enzyme[0] + label + "\t-\n")

        # When the cut lies past the midpoint of the site, the antisense cut
        # has the smaller coordinate and goes first to keep the output sorted.
        if len(enzyme[1])/2 < int(enzyme[2]):
            outfile.write(antisense_line + sense_line)
        else:
            outfile.write(sense_line + antisense_line)

        count += 1
    return count
def split_relabel(infile, gene, m, sample_IDs, output_handle):
    """Trim primers from matching reads, relabel them and write them out.

    A read is kept when the forward primer for ``gene`` matches at position
    0. The forward primer is cut from the start; for merged reads the
    reverse primer length is also cut from the end. The kept read's id gets
    ``|gene_<gene>|sample_<sample>`` appended before it is written to
    ``output_handle`` in FASTQ format. A summary is appended to
    ``Primer_split_log.txt``.

    Args:
        infile: Path to the FASTQ reads file; the label is parsed from it.
        gene: Key into the module-level ``primerlist`` (forward, reverse).
        m: Either ``"merged"`` or ``"unmerged"``.
        sample_IDs: Mapping from well number to sample ID.
        output_handle: Open handle receiving the trimmed reads; it is not
            closed here.

    Raises:
        ValueError: If ``m`` is neither ``"merged"`` nor ``"unmerged"``.
    """
    if m == "merged":
        label = re.split('Wx80_|_pear', infile)[1]
    elif m == "unmerged":
        label = re.split('Wx80_|_L001', infile)[1]
    else:
        # Previously this surfaced later as an unbound-variable error.
        raise ValueError(
            "m must be 'merged' or 'unmerged', got {0!r}".format(m))

    well = re.split('_S', label)[0]  # NZGL well number
    sample_ID = sample_IDs.get(well)  # Corresponding sample ID
    primer_f = primerlist.get(gene)[0]
    primer_r = primerlist.get(gene)[1]
    count = 0

    # Context managers guarantee both files are closed even if parsing or
    # writing raises part-way through.
    with open("Primer_split_log.txt", "a") as log, open(infile) as reads_handle:
        log.write("Reads file\tPrimer\tRead count\n")
        for record in SeqIO.parse(reads_handle, "fastq"):
            primer_search = SeqUtils.nt_search(str(record.seq), primer_f)
            # Keep only reads where the primer was found (len > 1) at the
            # very start of the sequence ([1] == 0).
            if not (len(primer_search) > 1 and primer_search[1] == 0):
                continue
            if m == "merged":
                trimmed = record[len(primer_f):-len(primer_r)]
            else:
                trimmed = record[len(primer_f):]
            # Add gene and sample ID to the read id.
            trimmed.id = (
                "{0}|gene_{1}|sample_{2}".format(trimmed.id, gene, sample_ID))
            print(trimmed.id)
            print(len(trimmed.seq))
            SeqIO.write(trimmed, output_handle, "fastq")
            count += 1
            print("{0} {1} {2} {3}".format(gene, label, sample_ID, count))
        print("{0} {1}: Saved {2} reads".format(label, gene, count))
        # End the entry with a newline so the next call's header starts on
        # its own line in the append-mode log.
        log.write("{0} {1} {2}: {3}\n".format(gene, label, sample_ID, count))
Esempio n. 16
0
                    help='Motif fasta file')
parser.add_argument('--search',
                    '-s',
                    required=True,
                    help='Sequence fasta file to search.')
parser.add_argument('--outfile',
                    '-o',
                    type=argparse.FileType('w', encoding='UTF-8'),
                    required=True,
                    help='Outfile (will be in bed format).')
args = parser.parse_args()

d = []
for seq_motif in SeqIO.parse(args.motif, "fasta"):
    for seq in SeqIO.parse(args.search, "fasta"):
        results = SeqUtils.nt_search(str(seq.seq), seq_motif.seq)
        results_rc = SeqUtils.nt_search(str(seq.seq),
                                        seq_motif.seq.reverse_complement())
        for i in results[1:]:
            d.append({
                'search': seq.id,
                'motif': results[0],
                'first': i + 1,
                'last': (i) + len(results[0])
            })
        for i in results_rc[1:]:
            d.append({
                'search': seq.id,
                'motif': results_rc[0],
                'first': i + 1,
                'last': (i) + len(results_rc[0])
 def map_locator_Spark(x, subsequence):
     """Return True when ``subsequence`` matches somewhere in ``x[1]``."""
     # The first entry is the search pattern; further entries are match positions.
     hits = SeqUtils.nt_search(x[1], subsequence)
     return len(hits) > 1
Esempio n. 18
0
from Bio import SeqIO	
from Bio.Alphabet import IUPAC		
from Bio.Seq import Seq
from Bio import motifs			
from Bio import SeqUtils


def _scan_transcripts(sites_path, result_path,
                      transcripts_path='input/gencode.v26.lncRNA_transcripts.fa'):
    """Search every transcript for the degenerate consensus of a sites file.

    Writes, per transcript, a ``>id`` line followed by the ``nt_search``
    result list to ``result_path``.

    Returns the motif object read from ``sites_path``.
    """
    with open(sites_path) as handle:
        site_motif = motifs.read(handle, "sites")

    pattern = str(site_motif.degenerate_consensus)

    with open(result_path, "w") as out:
        for seq_record in SeqIO.parse(transcripts_path, 'fasta'):
            out.write(">" + str(seq_record.id) + "\n")
            # Search the nucleotide sequence itself: str(seq_record) would be
            # the record's multi-line summary, not its sequence.
            result = SeqUtils.nt_search(str(seq_record.seq), pattern)
            out.write(str(result) + "\n")
    return site_motif


p53 = _scan_transcripts("sites/MA0106.1.sites", "output/motif_result_p53.txt")
motif = p53.degenerate_consensus

##

AGL3 = _scan_transcripts("sites/MA0001.1.sites", "output/motif_result_AGL3.txt")
motif = AGL3.degenerate_consensus
Esempio n. 19
0
 def find_enzyme(input_seq):
     """Run nt_search for every entry of ``Restriction.data["IUPAC sequence"]``.

     The search results are discarded; the function always returns None.
     """
     query = Seq(input_seq, alphabet=Bio.Alphabet.IUPAC.unambiguous_dna)
     recognition_sites = Restriction.data["IUPAC sequence"]
     for site in recognition_sites:
         SeqUtils.nt_search(query, site)
     return None
Esempio n. 20
0
from Bio import SeqUtils

# Pattern to search for (presumably written in IUPAC ambiguity codes).
consensus = "RGWYV"

# Sequence that is searched for the pattern.
sequence = "CGTAGCTAGCTCAGAGCAGGGACACGTGCTAGCAACAGCGCT"

# Run the search; the return value is neither stored nor printed here.
SeqUtils.nt_search(sequence, consensus)
Esempio n. 21
0
def FindSpacersInterval(chromPos, chromStartRG, chromEndRG, seqStr,
                        cutoff_spacing, referenceGenomeForDAS, spacerLength,
                        distanceToCutSiteFromPAM_bp):
    """Find spacers next to PAM sites on both strands of a genomic interval.

    The interval sequence is fetched with ``coords2fa`` and upper-cased. The
    PAM ``seqStr`` is searched on the plus strand, and its reverse complement
    is searched as well unless the PAM equals its own reverse complement.
    A spacer is kept only when its start lies more than ``cutoff_spacing``
    from the start of the previously kept spacer on the same strand.

    Reads the module-level ``PAMside``: 3 selects the branch commented as
    Cas9-like, any other value the branch commented as Cpf1-like.

    Returns:
        A tuple ``(listSpacer, spacerNumTotal, meanDist)``. Each item of
        ``listSpacer`` is ``[spacerNum, strand, startLocInRefSeq,
        endLocInRefSeq, chromPos, startLocInRefGenome, endLocInRefGenome,
        cutSiteInRefGenome, distFromPrevSpacer, spacerAsStr, exactPAM,
        GCcontent]``; ``meanDist`` is ``np.mean`` over the recorded
        distances.

    NOTE(review): this block uses Python 2 ``print`` statements.
    NOTE(review): ``forwardNameString``, ``formatDigitsN`` and ``idx`` are
    assigned but not used within this function.
    NOTE(review): when no spacer is kept, ``np.mean`` receives an empty
    array - presumably yielding NaN; confirm that callers handle it.
    """
    from Bio import SeqFeature

    if PAMside == 3:
        distanceToCutSiteFrom5pEnd = spacerLength - distanceToCutSiteFromPAM_bp
        # For SpCas9 (as an example): spacerLength = 20bp ; distanceToCutSiteFromPAM_bp = 3bp; distanceToCutSiteFrom5pEnd = 17bp
    else:
        distanceToCutSiteFrom5pEnd = distanceToCutSiteFromPAM_bp - 1
        # For AsCpf1 (as an example): spacerLength = 20bp ; distanceToCutSiteFromPAM_bp = 19bp; distanceToCutSiteFrom5pEnd = 18bp

    s = coords2fa(chromPos, chromStartRG, chromEndRG, referenceGenomeForDAS)

    s = s.upper()
    PAM = Seq(seqStr, IUPAC.ambiguous_dna)
    PAM_length = len(seqStr)
    # A PAM equal to its own reverse complement needs only one search pass.
    if seqStr == str(PAM.reverse_complement()):
        DoRevComp = 0
        forwardNameString = "{name}_{num:0{width}}"
    else:
        DoRevComp = 1
        forwardNameString = "{name}_F{num:0{width}}"
    listSpacer = []
    listDistBetweenSpacers = []

    spacerNum = 0
    # Sentinel far below any real position, so the first spacer passes the spacing check.
    prevStartLocInRefSeq = -9999
    if PAMside == 3:
        gbStringForSearch = s[spacerLength:]
        # Cas9
    else:
        gbStringForSearch = s[:-spacerLength]
        # Cpf1, get all but last ~20 bases of sequence

    spacerInds = SeqUtils.nt_search(gbStringForSearch, str(PAM))
    if len(spacerInds) > 1:  # matches found
        del spacerInds[0]  # first result from nt_search is regexp expansion
        #print "len line below {fname}".format(fname=len(spacerInds))
        formatDigitsN = int(math.ceil(math.log(len(spacerInds), 10)))
        print "Plus strand sgRNAs found: {num}".format(num=len(spacerInds))

        for idx, item in enumerate(spacerInds):
            startPos = SeqFeature.ExactPosition(
                item)  # start and end pos of PAM
            endPos = SeqFeature.ExactPosition(item + PAM_length)

            if PAMside == 3:  # Cas9-like
                startLocInRefSeq = startPos + 1
                endLocInRefSeq = startLocInRefSeq + spacerLength - 1
            else:  # Cpf1-like
                startLocInRefSeq = endPos  #Starts immediately after PAM
                endLocInRefSeq = startLocInRefSeq + spacerLength

            # Shift interval-relative positions by the interval start.
            startLocInRefGenome = chromStartRG + startLocInRefSeq
            endLocInRefGenome = chromStartRG + endLocInRefSeq - 1
            cutSiteInRefGenome = startLocInRefGenome + distanceToCutSiteFrom5pEnd

            # Only add the spacer if it is a certain distance from the previous spacer
            if (startLocInRefSeq - prevStartLocInRefSeq) > cutoff_spacing:
                spacerNum += 1
                strand = "+"
                if spacerNum > 1:
                    distFromPrevSpacer = startLocInRefSeq - prevStartLocInRefSeq
                else:
                    distFromPrevSpacer = 0
                if PAMside == 3:
                    spacerAsStr = str(s[startLocInRefSeq - 1:endLocInRefSeq])
                    exactPAM = s[endLocInRefSeq:endLocInRefSeq + PAM_length]
                else:
                    spacerAsStr = str(s[startLocInRefSeq:endLocInRefSeq])
                    exactPAM = s[startLocInRefSeq -
                                 PAM_length:startLocInRefSeq]
                    # Python slices: second index is first char you *DON'T* want

                GCcontent = SeqUtils.GC(spacerAsStr)
                listItem = [
                    spacerNum, strand, startLocInRefSeq, endLocInRefSeq,
                    chromPos, startLocInRefGenome, endLocInRefGenome,
                    cutSiteInRefGenome, distFromPrevSpacer, spacerAsStr,
                    exactPAM, GCcontent
                ]
                listSpacer.append(listItem)
                listDistBetweenSpacers.append(float(distFromPrevSpacer))
                prevStartLocInRefSeq = startLocInRefSeq

    print "Plus strand sgRNAs included after minimum spacing (> {limit}bp between sgRNAs): {num}".format(
        limit=cutoff_spacing, num=spacerNum)
    spacerNumTotal = spacerNum

    # Search rev complement of PAM
    # print PAM
    # print PAM.reverse_complement()
    # Reset the per-strand spacing state before the minus-strand pass.
    prevStartLocInRefSeq = -9999
    spacerNum = 0
    if DoRevComp:
        if PAMside == 3:
            gbStringForSearch = s[:-spacerLength]
            # get all but last ~20 bases of sequence
        else:
            gbStringForSearch = s[spacerLength:]

        spacerInds = SeqUtils.nt_search(gbStringForSearch,
                                        str(PAM.reverse_complement()))
        if len(spacerInds) > 1:  # matches found
            del spacerInds[
                0]  # first result from nt_search is regexp expansion
            #print "len line below {fname}".format(fname=len(spacerInds))
            formatDigitsN = int(math.ceil(math.log(len(spacerInds), 10)))
            print "Minus strand sgRNAs found: {num}".format(
                num=len(spacerInds))

            for idx, item in enumerate(spacerInds):
                startPos = SeqFeature.ExactPosition(item)
                endPos = SeqFeature.ExactPosition(item + PAM_length)
                #print "Start pos: {num}  End pos: {num2}".format(num=startPos,num2=endPos)

                # Start and end locations are flipped here due to reverse strand
                if PAMside == 3:
                    endLocInRefSeq = endPos + 1  #flipped for reverse strand
                    startLocInRefSeq = endLocInRefSeq + spacerLength - 1  #flipped for reverse strand
                else:
                    # startLocInRefSeq is 5' end of spacer on PAM-containing strand
                    # endLocInRefSeq is 3' end of spacer on PAM-containing strand
                    # Hence endLocInRefSeq <  startLocInRefSeq since this is reverse strand
                    startLocInRefSeq = startPos + spacerLength  # b/c spacer length is the offset between gbStringForSearch to RefSeq
                    endLocInRefSeq = startLocInRefSeq - spacerLength + 1

                startLocInRefGenome = chromStartRG + startLocInRefSeq - 1
                endLocInRefGenome = chromStartRG + endLocInRefSeq - 1
                cutSiteInRefGenome = startLocInRefGenome - distanceToCutSiteFrom5pEnd

                # Only add the spacer if it is a certain distance from the previous spacer
                if (startLocInRefSeq - prevStartLocInRefSeq) > cutoff_spacing:
                    spacerNum += 1
                    strand = "-"
                    if spacerNum > 1:
                        distFromPrevSpacer = startLocInRefSeq - prevStartLocInRefSeq
                    else:
                        distFromPrevSpacer = 0
                    if PAMside == 3:  # Cas9-like
                        spacerRC = Seq(
                            str(s[endLocInRefSeq - 1:startLocInRefSeq]),
                            IUPAC.ambiguous_dna)
                        spacerAsStr = str(spacerRC.reverse_complement())
                        exactPAM = str(
                            Seq(
                                str(s[endLocInRefSeq -
                                      (PAM_length + 1):endLocInRefSeq - 1]),
                                IUPAC.ambiguous_dna).reverse_complement())
                    else:  # Cpf1-like
                        spacerRC = Seq(
                            str(s[endLocInRefSeq - 1:startLocInRefSeq]),
                            IUPAC.ambiguous_dna)
                        spacerAsStr = str(spacerRC.reverse_complement())
                        exactPAM = str(
                            Seq(
                                str(s[startLocInRefSeq:startLocInRefSeq +
                                      PAM_length]),
                                IUPAC.ambiguous_dna).reverse_complement())

                    GCcontent = SeqUtils.GC(spacerAsStr)
                    listItem = [
                        spacerNum, strand, startLocInRefSeq, endLocInRefSeq,
                        chromPos, startLocInRefGenome, endLocInRefGenome,
                        cutSiteInRefGenome, distFromPrevSpacer, spacerAsStr,
                        exactPAM, GCcontent
                    ]

                    listSpacer.append(listItem)
                    listDistBetweenSpacers.append(float(distFromPrevSpacer))
                    prevStartLocInRefSeq = startLocInRefSeq

        print "Minus strand sgRNAs included after minimum spacing (> {limit}bp between sgRNAs): {num}".format(
            limit=cutoff_spacing, num=spacerNum)
        spacerNumTotal = spacerNumTotal + spacerNum

    # Mean over the distances recorded for both strands.
    arrDistBetweenSpacers = np.asarray(listDistBetweenSpacers)
    meanDist = np.mean(arrDistBetweenSpacers)
    return (listSpacer, spacerNumTotal, meanDist)
Esempio n. 22
0
    def clean(self):
        '''
        Clean data, adding an unsaved InchwormAssembly model ('assembly') and
        a list of stages ('stages') to self.cleaned_data
        '''
        cleaned_data = super(PathwayForm, self).clean()

        # Don't do anything if some fields are missing
        if not all(x in cleaned_data.keys() for x in
                   ['file', 'rbs_annotation_type', 'cds_annotation_type']):
            return cleaned_data

        def validate_contiguity(features):
            for i in range(len(features) - 1):
                if features[i].location.end != features[i+1].location.start:
                    raise forms.ValidationError(
                        'Features {} (of type {}) and {} (of type {}) must be contiguous.'.format(
                            features[i].qualifiers['label'][0],
                            features[i].type,
                            features[i+1].qualifiers['label'][0],
                            features[i+1].type,
                        ))

        record = SeqIO.read(cleaned_data['file'], 'genbank')
        feature_dict = {
            (feature.qualifiers['label'][0], feature.type): feature
            for feature in record.features
        }

        # Make sure all the required features are present
        pathway_features = []
        for stage_name in self.stage_names:
            rbs_key = (stage_name, cleaned_data['rbs_annotation_type'])
            cds_key = (stage_name, cleaned_data['cds_annotation_type'])
            try:
                pathway_features.append(feature_dict[rbs_key])
                pathway_features.append(feature_dict[cds_key])
            except KeyError as e:
                raise forms.ValidationError(
                    'Stage {} has no feature of type {}.'.format(*e.args[0]))

        # Make sure all the features are contiguous
        validate_contiguity(pathway_features)

        # Save all the annealable sequences
        annealable_seqs = []
        for i, stage_name in enumerate(self.stage_names):
            cds_feature = pathway_features[2*i + 1]

            annealable_seq = None

            for sequence_context in self.sequence_contexts:
                annealable_seq_name = '{} from {}'.format(
                    stage_name, sequence_context['name'])

                sequence_context['file'].seek(0)
                context_record = SeqIO.read(sequence_context['file'], 'genbank')

                search_result = SeqUtils.nt_search(
                    str(context_record.seq),
                    str(cds_feature.extract(record).seq),
                )

                if len(search_result) > 1:
                    annealable_seq = Gene(
                        file=sequence_context['file'],
                        start=search_result[1] + 1,
                        end=search_result[1] + len(cds_feature),
                        strand=1,
                        name=annealable_seq_name,
                        type=Gene.ANNEALABLE_SEQ,
                    )
                    annealable_seq.save()
                    break

                # No forward match found, so search the reverse strand
                rev_search_result = SeqUtils.nt_search(
                    str(context_record.seq),
                    str(cds_feature.extract(record).seq.reverse_complement()),
                )
                if len(rev_search_result) > 1:
                    annealable_seq = Gene(
                        file=sequence_context['file'],
                        start=rev_search_result[1] + 1,
                        end=rev_search_result[1] + len(cds_feature),
                        strand=-1,
                        name=annealable_seq_name,
                        type=Gene.ANNEALABLE_SEQ,
                    )
                    annealable_seq.save()
                    break

            if annealable_seq is None:
                # No sequence context matched, so do non-nested PCR directly off
                # the coding sequence
                seq_file = ContentFile('')
                annealable_seq = Gene(
                    file=seq_file,
                    start=1,
                    end=len(cds_feature),
                    strand=1,
                    name=stage_name,
                    type=Gene.ANNEALABLE_SEQ,
                )
                annealable_seq.save()
                seq_record = cds_feature.extract(record)
                seq_record.id = ''
                seq_record.name = ''
                SeqIO.write(seq_record, seq_file, 'genbank')
                annealable_seq.file.save(stage_name, seq_file)

            annealable_seqs.append(annealable_seq)

        # Save the genome
        if len(record[:pathway_features[0].location.start]) < self.fwd_ha_len:
            raise forms.ValidationError(
                '5’ genome context must be at least {} bp long.'.format(
                    self.fwd_ha_len))
        if len(record[pathway_features[-1].location.end:]) < self.rev_ha_len:
            raise forms.ValidationError(
                '3’ genome context must be at least {} bp long.'.format(
                    self.rev_ha_len))
        genome_record = record[:pathway_features[0].location.start] + \
                        record[pathway_features[-1].location.end:]
        genome_record.name = 'genome'
        genome_file = ContentFile('')
        genome = Gene(
            file=genome_file,
            start=pathway_features[0].location.start + 1,
            end=pathway_features[0].location.start,
            strand=1,
            name='Genome context',
        )
        genome.save()
        SeqIO.write(genome_record, genome_file, 'genbank')
        genome.file.save('genome', genome_file)

        # Save the stages
        cleaned_data['stages'] = []
        for i, stage_name in enumerate(self.stage_names):
            rbs_feature = pathway_features[2*i]
            stage = Stage(
                degeneracy=str(rbs_feature.extract(record).seq),
                annealable_seq = annealable_seqs[i],
                selection_cassette=self.selection_cassettes[i],
                name=stage_name,
            )
            cleaned_data['stages'].append(stage)
            stage.save()

        # Save the InchwormAssembly object
        cleaned_data['assembly'] = InchwormAssembly(
            genome=genome,
            enzyme=self.enzyme,
            library_size=self.library_size,
            dna_required=self.dna_required,
            fwd_ha_len=self.fwd_ha_len,
            rev_ha_len=self.rev_ha_len,
        )
        return cleaned_data
Esempio n. 23
0
    def get_context_data(self, **kwargs):
        '''
        Build the template context for the assembly output page.

        On top of the parent context this adds:

        * ``output`` -- ``self.object.output``, with each stage entry
          annotated in place with ``gg_primer_names``,
          ``integration_primer_names``, ``phenotype`` and (when library
          sizes are available) ``dna_required``.
        * ``primers`` -- ``self.object.primers`` as (name, sequence) pairs.
        * ``gg_pcr_details`` -- one dict per unique first-round PCR
          (unique by primer pair and template sequence).
        * ``gg_details`` -- per stage, which PCR products go into the
          Golden Gate reaction.
        * ``round2_pcr_details`` -- per stage, the second-round PCR.
        * ``transformation_details`` -- per stage, insert name and phenotype.

        Raises AssertionError if a Golden Gate primer cannot be located on
        its template, and KeyError if a primer sequence is not listed in
        ``self.object.primers``.
        '''
        output = self.object.output
        primers = self.object.primers

        library_sizes = self.get_library_sizes()

        # Load the stages once, in primary-key order, instead of issuing a
        # fresh query for every index lookup inside the loops below.
        stages = list(self.object.stages.order_by('pk'))

        # If several names share one sequence, the last one listed wins.
        primer_names_by_sequence = {
            sequence: name for name, sequence in primers
        }

        def primer_name(primer):
            return primer_names_by_sequence[str(primer.full_seq().seq)]

        for i, stage_output in enumerate(output):
            stage_output['gg_primer_names'] = [
                (primer_name(primer1), primer_name(primer2))
                for primer1, primer2 in stage_output['gg'].primers
            ]
            stage_output['integration_primer_names'] = [
                primer_name(primer)
                for primer in stage_output['insert'].generate_primers()
            ]
            stage_output['phenotype'] = \
                stages[i].selection_cassette.phenotype
            if library_sizes:
                stage_output['dna_required'] = \
                    library_sizes[i] * self.object.dna_required

        # Compile unique Golden Gate PCR reactions for the tabular view
        gg_pcrs_by_primers_and_template = dict()
        gg_pcr_details = []
        for i, stage_output in enumerate(output):
            gg = stage_output['gg']
            for j in range(3):
                search_template = str(gg.genes[j].subrecord().seq.upper())
                # A real list (not a lazy ``map``) so that the concatenation
                # works on Python 3 as well as Python 2.
                primer_names = [primer_name(p) for p in gg.primers[j]]
                primer_names_and_template = tuple(
                    primer_names + [search_template])
                if primer_names_and_template in gg_pcrs_by_primers_and_template:
                    # Same primers on the same template: already listed.
                    continue

                # Get length of PCR product
                primer1 = gg.primers[j][0]
                primer2 = gg.primers[j][1]
                forward_search_result = SeqUtils.nt_search(
                    search_template,
                    primer1.anneal_seq().upper(),
                )
                reverse_search_result = SeqUtils.nt_search(
                    search_template,
                    primer2.anneal_seq().reverse_complement().upper(),
                )

                # nt_search returns [pattern, pos, ...]; a length of 1
                # therefore means "no match".
                assert len(forward_search_result) > 1 and \
                       len(reverse_search_result) > 1, \
                    'Golden Gate primers do not anneal to their template'

                # Get name of template
                stage = stages[i]
                if j == 0:
                    template_name = stage.annealable_seq.name
                elif j == 1:
                    template_name = stage.selection_cassette.name
                else:
                    template_name = 'Genome'

                # Get primer Tm
                forward_tm = recombineering.utils.Tm(
                    str(primer1.anneal_seq().seq))
                reverse_tm = recombineering.utils.Tm(
                    str(primer2.anneal_seq().seq))

                details = {
                    'product': 'gg{}-{}'.format(i+1, j+1),
                    'size': len(primer1.overhang) +
                            (reverse_search_result[1] -
                             forward_search_result[1]) +
                            len(primer2.full_seq()),
                    'primer_names': primer_names_and_template,
                    'template': template_name,
                    'forward_tm': forward_tm,
                    'reverse_tm': reverse_tm,
                }
                gg_pcrs_by_primers_and_template[
                    primer_names_and_template] = details
                gg_pcr_details.append(details)

        # Compile information about second-round PCRs
        round2_pcr_details = []
        for i, stage_output in enumerate(output):
            insert = stage_output['insert']
            insert_len = sum([
                insert.fwd_ha_len,
                len(insert.degeneracy),
                len(insert.sequence),
                insert.rev_ha_len,
            ])
            insert_primers = insert.generate_primers()

            details = {
                'product': 'stage{}'.format(i+1),
                'size': insert_len,
                # A list, so the template can iterate it more than once.
                'primer_names': [primer_name(p) for p in insert_primers],
                'template': 'gg{}'.format(i+1),
                'forward_tm': recombineering.utils.Tm(
                    str(insert_primers[0].anneal_seq().seq)),
                'reverse_tm': recombineering.utils.Tm(
                    str(insert_primers[1].anneal_seq().seq)),
            }

            if library_sizes:
                details['dna_required'] = \
                    library_sizes[i] * self.object.dna_required

            round2_pcr_details.append(details)

        # Determine what goes into which Golden Gate reaction
        gg_details = []
        for i, stage_output in enumerate(output):
            fragments = []
            for j, (primer1, primer2) in enumerate(stage_output['gg'].primers):
                template = str(stage_output['gg'].genes[j].subrecord().seq.upper())
                primer_names_and_template = (
                    primer_name(primer1),
                    primer_name(primer2),
                    template,
                )
                fragments.append(
                    gg_pcrs_by_primers_and_template[primer_names_and_template]['product'])

            gg_details.append({
                'product': 'gg{}'.format(i+1),
                'size': len(stage_output['gg'].product),
                'fragments': fragments,
            })

        # Transformation details
        transformation_details = []
        for i in range(len(output)):
            transformation_details.append({
                'insert_name': round2_pcr_details[i]['product'],
                'phenotype': stages[i].selection_cassette.phenotype,
            })

        context = super(OutputView, self).get_context_data(**kwargs)
        context['output'] = output
        context['primers'] = primers

        context['gg_pcr_details'] = gg_pcr_details
        context['gg_details'] = gg_details
        context['round2_pcr_details'] = round2_pcr_details
        context['transformation_details'] = transformation_details

        return context
# Adapter-trimming script (excerpt; the code is cut off inside the `try`
# below, so its exception handler is not visible here).
# The names fastafile, parsed, adapter, len_adapter, removeadapters, end_defn
# and the counters are not assigned in this excerpt -- presumably they are set
# earlier in the script.

# Override the FASTA output path for the 3' test input.
if fastafile == "test3prime.fasta":
    output_fh_name = "seqs_w_for_removed.fasta"

output_fh = open(output_fh_name, mode='w+')

# NOTE(review): the override below assigns the same value as the default, so
# it currently has no effect.
output_text_name = "info_w_for_removed.txt"
if fastafile == "test3prime.fasta":
    output_text_name = "info_w_for_removed.txt"

output_text_fh = open(output_text_name, mode='w+')

for record in parsed:
    try:
        sequence = str(record.seq)
        search = SeqUtils.nt_search(sequence, adapter)  # nt_search returns [pattern, pos1, pos2, ...]
        index = int(
            search[1]
        )  # Start of the first adapter match; raises IndexError when there is no match.
        adapter_start = index
        adapter_end = index + len_adapter
        count_adapter_found += 1
        total_seq_count += 1
        if removeadapters == "True":  # Trim only when requested (the flag is compared as a string).
            if end_defn == "5":
                record = record[
                    adapter_end:]  # 5' adapter: keep only what follows the adapter.
            elif end_defn == "3":
                record = record[:
                                adapter_start]  # 3' adapter: keep only what precedes the adapter.
        elif removeadapters == "False":  # Leave the record untrimmed.
Esempio n. 25
0
from Bio import SeqIO
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq
from Bio import motifs
from Bio import SeqUtils

def _write_motif_hits(motif, out_path,
                      fasta_path='gencode.v26.lncRNA_transcripts.fa'):
    """Search every FASTA record for *motif* and write the hits to *out_path*.

    For each record two lines are written: ``>`` followed by the record id,
    then the ``str()`` of the list returned by ``SeqUtils.nt_search``
    (the expanded search pattern followed by the 0-based match positions).

    The search runs over ``seq_record.seq``. ``str(seq_record)`` must not be
    used here: it is the record's human-readable summary (id, name,
    description, abbreviated sequence), not the nucleotide sequence.
    """
    # nt_search walks the motif character by character, so hand it text.
    pattern = str(motif)
    with open(out_path, "w") as f:
        for seq_record in SeqIO.parse(fasta_path, 'fasta'):
            f.write(">" + str(seq_record.id) + "\n")
            result = SeqUtils.nt_search(str(seq_record.seq), pattern)
            f.write(str(result) + "\n")


with open("sites/MA0106.1.sites") as handle:
    p53 = motifs.read(handle, "sites")

motif = p53.degenerate_consensus

_write_motif_hits(motif, "motif_result_p53.txt")

##

with open("sites/MA0001.1.sites") as handle:
    AGL3 = motifs.read(handle, "sites")

motif = AGL3.degenerate_consensus

_write_motif_hits(motif, "motif_result_AGL3.txt")
Esempio n. 26
0
# Adapter-trimming script (excerpt).
# The names fastafile, parsed, adapter, len_adapter, removeadapters, end_defn
# and the three counters are not assigned in this excerpt -- presumably they
# are set earlier in the script.

# Override the FASTA output path for the 3' test input.
if fastafile=="test3prime.fasta":
    output_fh_name="output2.fasta"

output_fh = open(output_fh_name, mode='w+')

# Text output goes to output.txt, or output2.txt for the 3' test input.
# NOTE(review): nothing in this excerpt writes to or closes either handle.
output_text_name = "output.txt"
if fastafile=="test3prime.fasta":
    output_text_name="output2.txt"
output_text_fh = open(output_text_name, mode='w+')


# Locate the adapter in every record, optionally trim it, and write the record.
# A record without the adapter raises IndexError at search[1]; it is counted as
# "not found" and is neither written nor added to total_seq_count here.
for record in parsed:
    try:
        sequence = str(record.seq)
        search = SeqUtils.nt_search(sequence, adapter) # nt_search returns [pattern, pos1, pos2, ...]
        index = int(search[1]) # Start of the first adapter match; IndexError when there is no match.
        adapter_start = index
        adapter_end = index+len_adapter
        count_adapter_found +=1
        total_seq_count+=1
        if removeadapters == "True": # Trim only when requested (the flag is compared as a string).
            if end_defn=="5":
                record = record[adapter_end:] # 5' adapter: keep only what follows the adapter.
            elif end_defn=="3":
                record = record[:adapter_start] # 3' adapter: keep only what precedes the adapter.
        elif removeadapters == "False": # Leave the record untrimmed.
            record = record
        SeqIO.write(record, output_fh, format="fasta") # Write the (possibly trimmed) record.
    except IndexError:
        count_adapter_not_found+=1
Esempio n. 27
0
def createdb():
	"""Fetch three NCBI nucleotide records and store them in SQLite.

	For every record returned by Entrez this collects a ">GI<number>" label,
	a short description (primary accession plus the first words of the
	definition), the first 20 bases of the sequence (upper-cased), its reverse
	complement, and the str() of the SeqUtils.nt_search result for 'GG' on
	both strands. It then creates the table named by table_name2 in the
	database at sqlite_file and inserts one row per record.

	Relies on module-level names that are not defined in this function:
	Entrez, SeqUtils, sqlite3, sqlite_file, table_name2 and the column
	name/type variables.
	"""
	gis = [100753385, 100689306, 100751648]
	accession = []
	description = []
	sequence = []

	request = Entrez.epost("nucleotide",id=",".join(map(str,gis)))
	result = Entrez.read(request)
	webEnv = result["WebEnv"]
	queryKey = result["QueryKey"]
	handle = Entrez.efetch(db="nucleotide",retmode="xml", webenv=webEnv, query_key=queryKey)
	for r in Entrez.parse(handle):
		# Grab the GI number; records without a usable "gi|..." seq-id get None
		try:
			gi=int([x for x in r['GBSeq_other-seqids'] if "gi" in x][0].split("|")[1])
		except (ValueError, IndexError):
			gi=None
		# Whitespace-separated tokens: accession, definition words, 20-base prefix
		tokens = (" "+r["GBSeq_primary-accession"]+" "+r["GBSeq_definition"]+"\n"+r["GBSeq_sequence"][0:20]).split()
		accession.append(">GI" + str(gi))
		description.append(' '.join(tokens[0:3]))
		sequence.append(tokens[-1].upper())

	alt_map = {'ins':'0'}
	complement = {'A':'T','G':'C','T':'A','C':'G'}

	# getting the complementary sequence
	def reverse_complement(seq):
		# Shield multi-character tokens ('ins') behind a placeholder so they
		# survive the per-base complement, then restore them afterwards.
		for k,v in alt_map.items():
			seq = seq.replace(k,v)
		bases = ''.join(reversed([complement.get(base,base) for base in seq]))
		for k,v in alt_map.items():
			bases = bases.replace(v,k)
		return bases

	complementary_sequence = [reverse_complement(seq) for seq in sequence]

	# fetching the positions of 'GG' on both strands; each entry is the
	# string form of nt_search's [pattern, pos1, pos2, ...] result
	pattern = 'GG'
	exon = [str(SeqUtils.nt_search(exons, pattern)) for exons in sequence]
	comp_exon = [str(SeqUtils.nt_search(comp, pattern)) for comp in complementary_sequence]

	conn = sqlite3.connect(sqlite_file)
	try:
		c = conn.cursor()

		c.execute('CREATE TABLE {tn} ({nf} {ft} PRIMARY KEY)'\
				.format (tn=table_name2, nf=new_field, ft=field_type))
		c.execute("ALTER TABLE {tn} ADD COLUMN '{cn}' {ct}"\
				.format(tn=table_name2, cn=id_column, ct=column_type2))
		c.execute("ALTER TABLE {tn} ADD COLUMN '{cn}' {ct}"\
				.format(tn=table_name2, cn=description_column, ct=column_type3))
		c.execute("ALTER TABLE {tn} ADD COLUMN '{cn}' {ct}"\
				.format(tn=table_name2, cn=seq_column, ct=column_type4))
		c.execute("ALTER TABLE {tn} ADD COLUMN '{cn}' {ct}"\
				.format(tn=table_name2, cn=comp_seq_column, ct=column_type5))
		c.execute("ALTER TABLE {tn} ADD COLUMN '{cn}' {ct}"\
				.format(tn=table_name2, cn=PAM_column1, ct=column_type6))
		c.execute("ALTER TABLE {tn} ADD COLUMN '{cn}' {ct}"\
				.format(tn=table_name2, cn=PAM_column2, ct=column_type7))

		# One row per fetched record, numbered from 1. Every column is taken
		# from the same record index, so a row never mixes data from
		# different records.
		# NOTE(review): the INSERT hard-codes the table and column names,
		# whereas the CREATE/ALTER statements take them from variables --
		# confirm that they agree.
		rows = zip(accession, description, sequence, complementary_sequence, exon, comp_exon)
		for no, row in enumerate(rows, 1):
			c.execute('''INSERT INTO Gather_data(No, gi_accession, description, sequence,Complementary_sequence,PAMsites_exons,PAMsites_complementary) VALUES(?,?,?,?,?,?,?)''', (no,) + tuple(row))
		conn.commit()
	finally:
		# Release the database handle even if a statement above fails.
		conn.close()
Esempio n. 28
0
def FindSpacersInterval(chromPos, chromStartRG, chromEndRG, seqStr, cutoff_spacing, referenceGenomeForDAS,spacerLength, distanceToCutSiteFromPAM_bp):
	"""Find candidate spacers next to every PAM match in a genomic interval.

	The interval's sequence is obtained from coords2fa() and upper-cased. The
	PAM (seqStr; ambiguity codes are expanded by SeqUtils.nt_search) is
	searched on the plus strand and, unless the PAM equals its own reverse
	complement, the reverse-complemented PAM is searched as well to find
	minus-strand sites. On each strand a spacer is kept only if its start is
	more than cutoff_spacing away from the previously kept spacer.

	Reads the module-level name PAMside: 3 selects the branch commented as
	Cas9-like (PAM on the 3' side of the spacer); any other value selects the
	branch commented as Cpf1-like (PAM on the 5' side).

	Args:
		chromPos: chromosome identifier; passed to coords2fa and copied into
			every result row.
		chromStartRG: interval start; passed to coords2fa and added to
			in-sequence positions to obtain genome coordinates.
		chromEndRG: interval end; only passed to coords2fa.
		seqStr: PAM sequence as a string.
		cutoff_spacing: minimum distance (exclusive) between the starts of
			consecutive kept spacers on one strand.
		referenceGenomeForDAS: passed through to coords2fa.
		spacerLength: spacer length in bases.
		distanceToCutSiteFromPAM_bp: distance of the cut site from the PAM,
			used to derive the cut-site coordinate.

	Returns:
		(listSpacer, spacerNumTotal, meanDist) where each listSpacer row is
		[spacerNum, strand, startLocInRefSeq, endLocInRefSeq, chromPos,
		startLocInRefGenome, endLocInRefGenome, cutSiteInRefGenome,
		distFromPrevSpacer, spacerAsStr, exactPAM, GCcontent], spacerNumTotal
		is the number of kept spacers over both strands, and meanDist is the
		mean of the recorded distFromPrevSpacer values (the first spacer kept
		on each strand contributes 0).
	"""
	from Bio import SeqFeature	

	if PAMside == 3:
		distanceToCutSiteFrom5pEnd = spacerLength - distanceToCutSiteFromPAM_bp
		# For SpCas9 (as an example): spacerLength = 20bp ; distanceToCutSiteFromPAM_bp = 3bp; distanceToCutSiteFrom5pEnd = 17bp
	else:
		distanceToCutSiteFrom5pEnd = distanceToCutSiteFromPAM_bp-1
		# For AsCpf1 (as an example): spacerLength = 20bp ; distanceToCutSiteFromPAM_bp = 19bp; distanceToCutSiteFrom5pEnd = 18bp		
		
	# presumably coords2fa returns the interval's nucleotide sequence -- verify against its definition
	s = coords2fa(chromPos, chromStartRG, chromEndRG, referenceGenomeForDAS);	
	
	s=s.upper();
	PAM = Seq(seqStr, IUPAC.ambiguous_dna)
	PAM_length = len(seqStr);
	# A PAM equal to its own reverse complement matches both strands at the
	# same positions, so the separate minus-strand search is skipped.
	# NOTE(review): forwardNameString is assigned here but never read in this function.
	if seqStr == str(PAM.reverse_complement()):
		DoRevComp=0
		forwardNameString = "{name}_{num:0{width}}"
	else:
		DoRevComp=1
		forwardNameString = "{name}_F{num:0{width}}"
	listSpacer=[]
	listDistBetweenSpacers=[]
	
	spacerNum=0
	# Sentinel far to the left so that the first match normally passes the spacing filter
	prevStartLocInRefSeq=-9999
	# Trim the searched string so that every PAM match leaves room for a full
	# spacer. For PAMside == 3 the leading spacerLength bases are dropped,
	# which shifts match indices by -spacerLength relative to s.
	if PAMside == 3:
		gbStringForSearch = s[spacerLength:];	# Cas9
	else:
		gbStringForSearch = s[:-spacerLength];   # Cpf1, get all but last ~20 bases of sequence
				
	spacerInds = SeqUtils.nt_search(gbStringForSearch, str(PAM))
	if len(spacerInds) > 1:	# matches found 
		del spacerInds[0] # first result from nt_search is regexp expansion
		#print "len line below {fname}".format(fname=len(spacerInds))
		# NOTE(review): formatDigitsN is computed but never read in this function.
		formatDigitsN = int(math.ceil(math.log(len(spacerInds),10)));
		print "Plus strand sgRNAs found: {num}".format(num=len(spacerInds)) 

		for idx, item in enumerate(spacerInds):
			startPos = SeqFeature.ExactPosition(item)	# start and end pos of PAM
			endPos = SeqFeature.ExactPosition(item+PAM_length)  	

			if PAMside == 3:		# Cas9-like
				# The match index in the trimmed string equals the 0-based
				# spacer start in s, so +1 gives a 1-based start.
				startLocInRefSeq = startPos+1
				endLocInRefSeq = startLocInRefSeq+spacerLength-1
			else:					# Cpf1-like
				# Here the positions are 0-based slice bounds into s.
				startLocInRefSeq = endPos  #Starts immediately after PAM
				endLocInRefSeq = startLocInRefSeq+spacerLength  

			# NOTE(review): the start coordinate has no "-1" here, unlike the
			# end coordinate and unlike the minus-strand branch -- confirm
			# the intended coordinate convention.
			startLocInRefGenome = chromStartRG+startLocInRefSeq
			endLocInRefGenome = chromStartRG+endLocInRefSeq-1
			cutSiteInRefGenome = startLocInRefGenome+distanceToCutSiteFrom5pEnd

			# Only add the spacer if it is a certain distance from the previous spacer
			if (startLocInRefSeq-prevStartLocInRefSeq) > cutoff_spacing: 
				spacerNum += 1
				strand="+"
				# The first kept spacer has no predecessor, so its distance is recorded as 0
				if spacerNum > 1:
					distFromPrevSpacer = startLocInRefSeq-prevStartLocInRefSeq
				else:
					distFromPrevSpacer = 0
				if PAMside == 3:
					spacerAsStr = str(s[startLocInRefSeq-1:endLocInRefSeq])
					exactPAM = s[endLocInRefSeq:endLocInRefSeq+PAM_length];
				else:
					spacerAsStr = str(s[startLocInRefSeq:endLocInRefSeq])
					exactPAM = s[startLocInRefSeq-PAM_length:startLocInRefSeq];  # Python slices: second index is first char you *DON'T* want

				GCcontent = SeqUtils.GC(spacerAsStr);
				listItem = [spacerNum, strand, startLocInRefSeq, endLocInRefSeq, chromPos, startLocInRefGenome, endLocInRefGenome, 
							cutSiteInRefGenome, distFromPrevSpacer, spacerAsStr, exactPAM, GCcontent]				
				listSpacer.append(listItem)
				listDistBetweenSpacers.append(float(distFromPrevSpacer))
				prevStartLocInRefSeq=startLocInRefSeq
	
	
	print "Plus strand sgRNAs included after minimum spacing (> {limit}bp between sgRNAs): {num}".format(limit=cutoff_spacing,num=spacerNum) 
	spacerNumTotal=spacerNum
			
	# Search rev complement of PAM
	# print PAM
	# print PAM.reverse_complement()
	# Reset the spacing filter and the per-strand counter for the minus strand
	prevStartLocInRefSeq=-9999
	spacerNum=0
	if DoRevComp:
		# The trimming is mirrored relative to the plus-strand search
		if PAMside == 3:
			gbStringForSearch = s[:-spacerLength];   # get all but last ~20 bases of sequence
		else:
			gbStringForSearch = s[spacerLength:];
			
		spacerInds = SeqUtils.nt_search(gbStringForSearch,str(PAM.reverse_complement()))
		if len(spacerInds) > 1:	# matches found 
			del spacerInds[0] # first result from nt_search is regexp expansion
			#print "len line below {fname}".format(fname=len(spacerInds))
			# NOTE(review): formatDigitsN is computed but never read in this function.
			formatDigitsN = int(math.ceil(math.log(len(spacerInds),10)));                                                                                                                                                                          
			print "Minus strand sgRNAs found: {num}".format(num=len(spacerInds))

			for idx, item in enumerate(spacerInds): 
				startPos = SeqFeature.ExactPosition(item) 
				endPos = SeqFeature.ExactPosition(item+PAM_length)   
				#print "Start pos: {num}  End pos: {num2}".format(num=startPos,num2=endPos)
				 			
				# Start and end locations are flipped here due to reverse strand
				if PAMside == 3:
					endLocInRefSeq = endPos+1  #flipped for reverse strand
					startLocInRefSeq = endLocInRefSeq+spacerLength-1  #flipped for reverse strand
				else:
					# startLocInRefSeq is 5' end of spacer on PAM-containing strand
					# endLocInRefSeq is 3' end of spacer on PAM-containing strand
					# Hence endLocInRefSeq <  startLocInRefSeq since this is reverse strand
					startLocInRefSeq = startPos + spacerLength # b/c spacer length is the offset between gbStringForSearch to RefSeq
					endLocInRefSeq = startLocInRefSeq - spacerLength +1

				startLocInRefGenome = chromStartRG+startLocInRefSeq-1
				endLocInRefGenome = chromStartRG+endLocInRefSeq-1
				# On the minus strand the cut site lies at a lower genome coordinate than the spacer's 5' end
				cutSiteInRefGenome = startLocInRefGenome-distanceToCutSiteFrom5pEnd
												
				# Only add the spacer if it is a certain distance from the previous spacer
				if (startLocInRefSeq-prevStartLocInRefSeq) > cutoff_spacing: 
					spacerNum += 1
					strand="-"
					if spacerNum > 1:
						distFromPrevSpacer = startLocInRefSeq-prevStartLocInRefSeq
					else:
						distFromPrevSpacer = 0
					# Spacer and PAM are sliced from the plus-strand text and
					# reverse-complemented so they are reported 5'->3' on the minus strand
					if PAMside == 3:# Cas9-like
						spacerRC = Seq(str(s[endLocInRefSeq-1:startLocInRefSeq]), IUPAC.ambiguous_dna)
						spacerAsStr = str(spacerRC.reverse_complement())
						exactPAM = str(Seq(str(s[endLocInRefSeq-(PAM_length+1):endLocInRefSeq-1]), IUPAC.ambiguous_dna).reverse_complement())
					else:	# Cpf1-like
						spacerRC = Seq(str(s[endLocInRefSeq-1:startLocInRefSeq]), IUPAC.ambiguous_dna)
						spacerAsStr = str(spacerRC.reverse_complement())
						exactPAM = str(Seq(str(s[startLocInRefSeq:startLocInRefSeq+PAM_length]), IUPAC.ambiguous_dna).reverse_complement())
						

					GCcontent = SeqUtils.GC(spacerAsStr);
					listItem = [spacerNum, strand, startLocInRefSeq, endLocInRefSeq, chromPos, startLocInRefGenome, endLocInRefGenome, 
								cutSiteInRefGenome, distFromPrevSpacer, spacerAsStr, exactPAM, GCcontent]				

					listSpacer.append(listItem)
					listDistBetweenSpacers.append(float(distFromPrevSpacer))
					prevStartLocInRefSeq=startLocInRefSeq		

		print "Minus strand sgRNAs included after minimum spacing (> {limit}bp between sgRNAs): {num}".format(limit=cutoff_spacing,num=spacerNum) 
		spacerNumTotal=spacerNumTotal+spacerNum;
	
	# NOTE(review): when no spacer was kept the list is empty; np.mean of an
	# empty array presumably yields nan with a warning -- confirm callers cope.
	arrDistBetweenSpacers = np.asarray(listDistBetweenSpacers)
	meanDist = np.mean(arrDistBetweenSpacers)
	return (listSpacer, spacerNumTotal, meanDist)
Esempio n. 29
0
    # Annotate one randomly chosen record and write it out as GenBank.
    for randomRec in range(1,2):
        # random.choice picks uniformly among all entries. The earlier
        # randint(1, len(records)) is inclusive at both ends, so it could
        # index one past the end and never selected the first record.
        record = random.choice(records)
        newRecord = SeqRecord(record.seq)

        #writing Header
        newRecord.seq.alphabet = generic_dna
        newRecord.id = record.id
        newRecord.name = record.name
        newRecord.description = record.description
        recordSeq = str(record.seq)

        for feature in featureStatistic_container:
            if feature not in ["PBS", "STF"]:
                for variation in featureStatistic_container[feature]:
                    # writeFeature takes only `strand`; presumably it reads
                    # `occurrence` (and related names) from the enclosing
                    # scope, so those names are kept as they are.
                    featureSeq = str(variation.seq)
                    occurrence = SeqUtils.nt_search(recordSeq, featureSeq)
                    writeFeature(strand=1)

                    # NOTE(review): a minus-strand feature reads as the
                    # *reverse* complement on the plus strand; complement()
                    # alone does not reverse -- confirm this is intended.
                    featureSeqComplement = str(variation.seq.complement())
                    occurrence = SeqUtils.nt_search(recordSeq, featureSeqComplement)
                    writeFeature(strand=-1)
            else:
                # STF and PBS features have dedicated writers
                if(feature == "STF"):
                    writeSTF()

                if(feature == "PBS"):
                    writePBS()

        SeqIO.write(newRecord, output_handle, "genbank")