예제 #1
0
def gene_feature(Y):
    """
    Build a table of gene-level features for every row of ``Y``.

    Each distinct name in ``Y["Target gene"]`` is looked up once with
    ``util.get_gene_sequence``; the sequence length, ``SeqUtil.GC`` value,
    ``Tm.Tm_staluc`` melting temperature (``rna=False``) and
    ``SeqUtil.molecular_weight`` (as "DNA") are then written to every row
    that targets that gene.

    Returns a DataFrame that shares the index of ``Y["Target gene"]`` and
    has the columns "gene length", "gene GC content", "gene temperature"
    and "gene molecular weight".
    """
    gene_names = Y["Target gene"]
    targets = gene_names.values
    column_labels = [
        "gene length",
        "gene GC content",
        "gene temperature",
        "gene molecular weight",
    ]

    # One row per entry of Y, one column per feature; rows stay zero until
    # their gene has been processed.
    features = np.zeros((gene_names.shape[0], len(column_labels)))

    for gene in gene_names.unique():
        seq = util.get_gene_sequence(gene)
        features[targets == gene] = (
            len(seq),
            SeqUtil.GC(seq),
            Tm.Tm_staluc(seq, rna=False),
            SeqUtil.molecular_weight(seq, "DNA"),
        )

    return pd.DataFrame(
        data=features,
        index=gene_names.index,
        columns=column_labels,
    )
예제 #2
0
def run_pam_finder(target_fa, seq, PAM, abs_start_pos, chr):
    """Find every occurrence of ``seq + PAM`` on both strands of ``target_fa``.

    The pattern is searched with ``SeqUtils.nt_search`` in ``target_fa`` and
    in ``revcomp(target_fa)``. Reverse-strand hits are converted back to
    forward-strand coordinates before ``abs_start_pos`` is added.

    Returns a DataFrame (default integer column labels) with one row per
    hit: ``chr``, start, end, the matched ``seq`` part, the matched
    ``seq + PAM`` part, and the strand ("+" or "-").
    """
    rev_seq = revcomp(target_fa)
    # nt_search returns the pattern first, followed by the hit positions.
    fwd_hits = SeqUtils.nt_search(target_fa, seq + PAM)[1:]
    rev_hits = SeqUtils.nt_search(rev_seq, seq + PAM)[1:]

    out = [
        [
            chr,
            s + abs_start_pos,
            s + abs_start_pos + len(seq),
            target_fa[s:(s + len(seq))],
            target_fa[s:(s + len(seq) + len(PAM))],
            "+",
        ]
        for s in fwd_hits
    ]
    out.extend(
        [
            chr,
            (len(target_fa) - s) + abs_start_pos - len(seq),
            (len(target_fa) - s) + abs_start_pos,
            rev_seq[s:(s + len(seq))],
            rev_seq[s:(s + len(seq) + len(PAM))],
            "-",
        ]
        for s in rev_hits
    )
    return pd.DataFrame(out)
예제 #3
0
def align(seq1, seq2, debug=False):
    """Locally align two residue sequences and return their negated offsets.

    Each input is joined into one string, converted with ``seq.seq1`` and
    has every 'X' replaced by '-'. The second flattened sequence is aligned
    against the first with ``pairwise2.align.localxs`` (both gap penalties
    -1000, a single alignment requested).

    For each sequence the leading gap columns of its aligned form are
    scanned; a column is counted when the sequence's own character at the
    shifted position is not '-'. Scanning stops at the first non-gap column.

    Returns ``(-offset_1, -offset_2)``. With ``debug`` set, a formatted
    alignment of the offset-trimmed sequences is printed.
    """
    flat1 = seq.seq1(''.join(seq1)).replace('X', '-')
    flat2 = seq.seq1(''.join(seq2)).replace('X', '-')
    flattened = (flat1, flat2)

    # aligning 2 to 1 seems to give better results
    alignments = pairwise2.align.localxs(
        flat2, flat1, -1000, -1000, one_alignment_only=True)
    best = alignments[0]

    offset = [0, 0]
    for i in range(2):
        assert len(best[0]) == len(best[1])
        # Sequence 2 comes first in the alignment result, so the aligned
        # form of sequence i sits at index (i + 1) % 2.
        aligned = best[(i + 1) % 2]
        own = flattened[i]
        for j in range(len(best[0])):
            if aligned[j] != '-':
                # first aligned residue reached: leading gaps are over
                break
            if own[j - offset[i]] != '-':
                offset[i] += 1

    if debug:
        print(
            pairwise2.format_alignment(flat2[offset[0]:], flat1[offset[1]:],
                                       10, 0,
                                       len(flat1) - offset[1]))
    return -offset[0], -offset[1]
예제 #4
0
def create_res_df_from_bam(input_file, reference):
    """Summarise per-contig read statistics of a BAM file.

    For every record of the FASTA file ``reference`` the reads fetched for
    that contig from ``input_file`` are concatenated, and the contig name,
    contig length, read count, reference GC, read GC and total number of
    read bases are collected.

    Args:
        input_file: Path of the BAM file. ``fetch``/``count`` by contig are
            used, which presumably requires the BAM to be indexed.
        reference: Path (or handle) of the reference FASTA.

    Returns:
        A DataFrame with the columns 'species', 'chr_length', 'gc_ref',
        'gc_reads', 'read_count' and 'basecount', one row per reference
        record.
    """
    species_list, chr_length_list, read_count_list = [], [], []
    basecount_list, gc_ref_list, gc_reads_list = [], [], []

    # Open the BAM once and close it deterministically. Previously two new
    # handles were opened for every reference record and never closed.
    with pysam.AlignmentFile(input_file, 'rb') as bam:
        for seq_record in SeqIO.parse(reference, 'fasta'):
            contig = seq_record.name

            # Reads without a stored sequence report None and are skipped so
            # the join does not fail on them.
            joined_reads = ''.join(
                read.query_sequence
                for read in bam.fetch(contig=contig)
                if read.query_sequence)

            species_list.append(contig)
            chr_length_list.append(len(seq_record.seq))
            read_count_list.append(bam.count(contig=contig))
            gc_ref_list.append(SeqUtils.GC(seq_record.seq))
            gc_reads_list.append(SeqUtils.GC(joined_reads))
            basecount_list.append(len(joined_reads))

    return pd.DataFrame(
        data={
            'species': species_list,
            'chr_length': chr_length_list,
            'gc_ref': gc_ref_list,
            'gc_reads': gc_reads_list,
            'read_count': read_count_list,
            'basecount': basecount_list,
        })
예제 #5
0
def count_amplicons(in_name, fprimer, rc):
    """Print where a primer occurs in the reads of a FASTQ file.

    For every read the position of the first forward-primer hit is tallied.
    When ``rc`` is true, the reverse complement of the same primer is
    searched as well: the number of bases following its last hit is tallied,
    and reads containing both are counted.

    Args:
        in_name: Path of the FASTQ file.
        fprimer: Primer sequence (ambiguous DNA alphabet).
        rc: Whether to also look for the reverse complement.

    Nothing is returned; the tallies are printed.
    """
    Fprimer = Seq(fprimer, IUPAC.ambiguous_dna)
    forward_pattern = str(Fprimer)
    pre_length = Counter()
    if rc:
        reverse_pattern = str(Fprimer.reverse_complement())
        reverse_len = len(reverse_pattern)
        post_length = Counter()
        bothfound = 0

    with open(in_name, 'r') as fastqF:
        for seqRecord in SeqIO.parse(fastqF, "fastq"):
            read = str(seqRecord.seq)
            # nt_search returns the pattern followed by the hit positions.
            forward_hits = SeqUtils.nt_search(read, forward_pattern)[1:]
            if forward_hits:
                pre_length[forward_hits[0]] += 1
            if not rc:
                continue
            reverse_hits = SeqUtils.nt_search(read, reverse_pattern)[1:]
            if reverse_hits:
                tail = len(seqRecord) - reverse_hits[-1] - reverse_len
                post_length[tail] += 1
                if forward_hits:
                    bothfound += 1

    print("Primers found:", sum(pre_length.values()))
    print("Counts of pre_length:", pre_length)
    if rc:
        print("Reverse primers found:", sum(post_length.values()))
        print("Counts of post_length", post_length)
        print("Both primer and reverse_complement found:", bothfound)
예제 #6
0
def gene_feature(Y, X, learn_options):
    '''
    Compute gene-level features for each row of ``Y``.

    The sequence of every distinct ``Y['Target gene']`` value is obtained
    from ``util.get_gene_sequence``; its length, ``SeqUtil.GC`` value,
    ``Tm.Tm_NN`` melting temperature (``rna=False``) and DNA molecular
    weight are stored on all rows that refer to that gene.

    ``X`` and ``learn_options`` are accepted but not used here.

    Returns a DataFrame indexed like ``Y['Target gene']`` with the columns
    'gene length', 'gene GC content', 'gene temperature' and
    'gene molecular weight'.
    '''
    gene_names = Y['Target gene']
    names = gene_names.values
    n_rows = gene_names.shape[0]

    # Four independent single-column arrays, one per feature.
    length_col, gc_col, tm_col, weight_col = (
        np.zeros((n_rows, 1)) for _ in range(4))

    for gene in gene_names.unique():
        seq = util.get_gene_sequence(gene)
        rows = names == gene
        length_col[rows] = len(seq)
        gc_col[rows] = SeqUtil.GC(seq)
        tm_col[rows] = Tm.Tm_NN(seq, rna=False)
        weight_col[rows] = SeqUtil.molecular_weight(seq, 'DNA')

    feature_matrix = np.hstack((length_col, gc_col, tm_col, weight_col))
    return pandas.DataFrame(
        data=feature_matrix,
        index=gene_names.index,
        columns=[
            'gene length', 'gene GC content',
            'gene temperature', 'gene molecular weight'
        ])
예제 #7
0
    def filtByPrimer(self, fwd_primer, rvs_primer):
        """Keep the read pairs in which both reads start with their primer.

        Forward and reverse FASTQ inputs are read in lockstep. A pair is
        kept when the first ``SeqUtils.nt_search`` hit of ``fwd_primer`` in
        the forward read and of ``rvs_primer`` in the reverse read are both
        at position 0. Kept pairs are appended to ``self.output_forward``
        and ``self.output_reverse``; all other pairs are counted as
        discarded. The two counts are printed at the end.

        The output files are opened once, in append mode, for the whole
        run rather than once per kept read. As a consequence they are
        created (empty) even when no pair is kept.

        Args:
            fwd_primer: Primer expected at the start of each forward read.
            rvs_primer: Primer expected at the start of each reverse read.
        """
        count_keep = 0
        count_discard = 0

        with open(self.input_forward) as fh, \
                open(self.input_reverse) as rh, \
                open(self.output_forward, 'a') as ofh, \
                open(self.output_reverse, 'a') as orh:

            pairs = zip(FastqGeneralIterator(fh), FastqGeneralIterator(rh))
            for (title_f, seq_f, qual_f), (title_r, seq_r, qual_r) in pairs:
                try:
                    keep = (SeqUtils.nt_search(seq_f, fwd_primer)[1] == 0
                            and SeqUtils.nt_search(seq_r, rvs_primer)[1] == 0)
                except IndexError:
                    # nt_search found no hit, so there is no index 1
                    keep = False

                if not keep:
                    count_discard += 1
                    continue

                ofh.write(
                    '@' + '\n'.join([title_f, seq_f, '+', qual_f]) + '\n')
                orh.write(
                    '@' + '\n'.join([title_r, seq_r, '+', qual_r]) + '\n')
                count_keep += 1

        print('     Number of reads saved: ' + str(count_keep))
        print('     Number of reads discard :' + str(count_discard))
예제 #8
0
def search_motif(sequence):
    """Locate protospacer windows next to every motif hit on both strands.

    The motif (``args.pam``) is searched with ``SeqUtils.nt_search`` in
    ``sequence.seq`` and in its reverse complement. For each hit a window of
    ``args.length_protospacer`` bases ending at the hit position is derived.
    Reverse-strand windows are converted to forward-strand coordinates.

    A forward window is kept only when its start is greater than 0; a
    reverse window only when its start is greater than 0 and its end is
    smaller than the sequence length.

    Returns a tuple ``(coordinates_fw, coordinates_rv)`` of lists of
    ``[start, end]`` pairs.
    """
    motif = str(args.pam)
    len_protospacer = int(args.length_protospacer)
    len_dna = int(len(sequence.seq))

    # nt_search returns the motif followed by the 0-based start of each hit.
    hits_fw = SeqUtils.nt_search(str(sequence.seq), motif)[1:]
    coordinates_fw = [
        [hit - len_protospacer, hit]
        for hit in hits_fw
        if hit - len_protospacer > 0
    ]

    reverse_seq = str(sequence.seq.reverse_complement())
    hits_rv = SeqUtils.nt_search(reverse_seq, motif)[1:]

    coordinates_rv = []
    for hit in hits_rv:
        # Mirror the reverse-strand window onto the forward strand.
        start = len_dna - hit
        end = len_dna - (hit - len_protospacer)
        if start > 0 and end < len_dna:
            coordinates_rv.append([start, end])

    return coordinates_fw, coordinates_rv
예제 #9
0
def compute_stats(seq):
    """Store length, GC and molecular weight of ``seq`` on ``SeqStats``.

    ``weight`` is set to None when ``SeqUtils.molecular_weight`` raises
    ValueError.

    NOTE(review): the attributes are written onto ``SeqStats`` itself (it is
    not called), so every call overwrites the same object and that object is
    what is returned -- confirm this is intended rather than ``SeqStats()``.
    """
    result = SeqStats
    result.length = len(seq)
    result.gc = SeqUtils.GC(seq)
    try:
        weight = SeqUtils.molecular_weight(seq)
    except ValueError:
        weight = None
    result.weight = weight
    return result
예제 #10
0
def target_genes_stats(
    genes=(
        'HPRT1', 'TADA1', 'NF2', 'TADA2B', 'NF1', 'CUL3', 'MED12', 'CCDC101'
    )):
    """Print length, GC content, melting temperature and weight per gene.

    Each gene name is resolved with ``get_gene_sequence``; genes for which
    it returns None are skipped silently.

    Args:
        genes: Iterable of gene names. The default is an immutable tuple so
            it cannot be altered between calls.
    """
    for gene in genes:
        seq = get_gene_sequence(gene)
        if seq is None:
            continue
        # A parenthesised single argument prints identically whether print
        # is a statement or a function.
        print('%s \t\t\t\t len: %d \t GCcont: %.3f \t Temp: %.4f \t molweight: %.4f' % (
            gene, len(seq), SeqUtil.GC(seq), Tm.Tm_staluc(
                seq, rna=False), SeqUtil.molecular_weight(seq, 'DNA')))
예제 #11
0
 def var1_to_var3(var1):
     """Rewrite a variant from one-letter to three-letter amino-acid codes.

     The reference residue, new residue and position are extracted with the
     ``variant`` helpers. Each residue is converted with ``SeqUtils.seq3``,
     except "*", which is kept unchanged.

     Returns the concatenation reference + position + new residue.
     """
     ref_one = variant.get_refAA(var1)
     new_one = variant.get_newAA(var1)
     position = variant.get_pos(var1)

     if ref_one == "*":
         ref_three = "*"
     else:
         ref_three = SeqUtils.seq3(ref_one)

     if new_one == "*":
         new_three = "*"
     else:
         new_three = SeqUtils.seq3(new_one)

     return ''.join((ref_three, str(position), new_three))
예제 #12
0
 def var3_to_var1(var3):
     """Rewrite a variant from three-letter to one-letter amino-acid codes.

     The reference residue, new residue and position are extracted with the
     ``variant`` helpers. Each residue is converted with ``SeqUtils.seq1``,
     except "*", which is kept unchanged.

     Returns the concatenation reference + position + new residue.
     """
     ref_three = variant.get_refAA(var3)
     new_three = variant.get_newAA(var3)
     position = variant.get_pos(var3)

     if ref_three == "*":
         ref_one = "*"
     else:
         ref_one = SeqUtils.seq1(ref_three)

     if new_three == "*":
         new_one = "*"
     else:
         new_one = SeqUtils.seq1(new_three)

     return ''.join((ref_one, str(position), new_one))
예제 #13
0
def computeGCContent(seq_file=None, sequence=None):
	"""Compute the GC content, in %, of a sequence or of a FASTA file.

	Exactly one of the two arguments must be given.

	seq_file: FASTA file; the result is the unweighted mean over all of
		its sequences (a debug message is logged when there are several).
	sequence: a single sequence.

	Raises Exception when neither or both arguments are given. A file
	without any sequence raises ZeroDivisionError.
	"""
	if (seq_file is None) == (sequence is None):
		# Fires both when nothing and when both arguments were passed.
		raise Exception("Error in computeGCContent: Exactly one of seq_file or sequence must be specified")
	if seq_file is None:
		return SeqUtils.GC123(sequence)[0]

	# use GC123 instead of GC to cope with dashes
	gc_contents = [SeqUtils.GC123(s.seq)[0] for s in SeqIO.parse(seq_file, format="fasta")]
	if len(gc_contents) > 1:
		logging.debug("gc_content is averaged over all sequences found in %s", seq_file)
	return sum(gc_contents)/len(gc_contents)
예제 #14
0
def get_distances(res_pairs, get_coords):
    ''' Compute the distance of every residue pair.

        res_pairs: iterable over tuples ((res_a, res_b), ...)
        get_coords: function to get residue coordinates; it is forwarded to
                    distances.calc_residue_distance

        Returns a list of 5-tuples: [(resn_a, resn_b, dist, aa_a, aa_b), ...]
        where resn_* is element 1 of the residue id and aa_* is the
        one-letter code of the residue name.

    '''
    rows = []
    for res_a, res_b in res_pairs:
        rows.append((
            res_a.id[1],
            res_b.id[1],
            distances.calc_residue_distance(res_a, res_b, get_coords),
            SeqUtils.seq1(res_a.resname),
            SeqUtils.seq1(res_b.resname),
        ))
    return rows
예제 #15
0
def writePBS():
    """Annotate primer matches on the current record as sequence features.

    For every entry of ``featureStatistic_container[feature]`` the tail of
    its sequence is searched in ``record.seq`` (strand 1) and in the reverse
    complement of ``record.seq`` (strand -1). Each hit is appended to
    ``newRecord.features`` as a ``SeqFeature`` of type ``str(feature)``
    carrying the entry's ``note``. When the full primer matches around the
    hit the feature gets an exact end, otherwise the end is wrapped in
    ``AfterPosition``.

    Reads the module-level names ``featureStatistic_container``,
    ``feature``, ``record`` and ``newRecord``; the names listed in the
    ``global`` statement are (re)bound as a side effect.
    """
    global variation, seqRecordToCheck, seqRecordToCheckComplement, difference, newFeature
    for variation in featureStatistic_container[feature]:
        primerSeq = str(variation.seq)
        primerName = variation.note

        # Search key: the primer from index len - 15 to its end.
        # NOTE(review): for primers shorter than 15 the start index is
        # negative, so the key is only the last (15 - len) characters --
        # confirm whether short primers can occur.
        partialPrimerSeq = primerSeq[len(primerSeq) - 15::]
        seqRecordToCheck = str(record.seq)
        seqRecordToCheckComplement = str(reverse_complement(record.seq))

        # nt_search returns the pattern followed by the hit positions, so a
        # length above 1 means at least one hit.
        matchingPrimerPositions = SeqUtils.nt_search(seqRecordToCheck, partialPrimerSeq)

        if (len(matchingPrimerPositions) > 1):
            # Number of primer characters in front of the search key.
            difference = len(primerSeq) - len(partialPrimerSeq)
            length = len(matchingPrimerPositions)
            for j in range(1, length):
                # Does the whole primer match when the window is shifted
                # back by the characters preceding the key?
                if primerSeq == seqRecordToCheck[matchingPrimerPositions[j] -
                        difference: matchingPrimerPositions[j] - difference + len(primerSeq)]:
                    # NOTE(review): the feature starts at the key's hit
                    # position, not at hit - difference -- confirm intended.
                    newFeature = SeqFeature(FeatureLocation(matchingPrimerPositions[j],
                                                            matchingPrimerPositions[j] + len(primerSeq),
                                                            strand=1), type=str(feature))
                    newFeature.qualifiers['note'] = primerName
                    newRecord.features.append(newFeature)

                else:
                    # Only the key matched: mark the end as open-ended.
                    newFeature = SeqFeature(FeatureLocation(matchingPrimerPositions[j], AfterPosition(
                        matchingPrimerPositions[j] + len(primerSeq)), strand=1), type=str(feature))
                    newFeature.qualifiers['note'] = primerName
                    newRecord.features.append(newFeature)

        # Same procedure on the reverse complement.
        # NOTE(review): hit positions are indices into the reverse-complemented
        # string and are used unchanged in the strand=-1 locations -- verify
        # that no conversion to forward coordinates is needed.
        matchingPrimerPositions = SeqUtils.nt_search(seqRecordToCheckComplement, partialPrimerSeq)

        if (len(matchingPrimerPositions) > 1):
            difference = len(primerSeq) - len(partialPrimerSeq)
            length = len(matchingPrimerPositions)
            for j in range(1, length):
                if primerSeq == seqRecordToCheckComplement[matchingPrimerPositions[j] -
                        difference: matchingPrimerPositions[j] - difference + len(primerSeq)]:
                    newFeature = SeqFeature(FeatureLocation(matchingPrimerPositions[j],
                                                            matchingPrimerPositions[j] + len(primerSeq),
                                                            strand=-1), type=str(feature))
                    newFeature.qualifiers['note'] = primerName
                    newRecord.features.append(newFeature)
                else:
                    newFeature = SeqFeature(FeatureLocation(matchingPrimerPositions[j], AfterPosition(
                        matchingPrimerPositions[j] + len(primerSeq)), strand=-1), type=str(feature))
                    newFeature.qualifiers['note'] = primerName
                    newRecord.features.append(newFeature)
def get_gen_stats(gbk_list):
    """Compute GC content, coding density and length per GenBank file.

    For each file all records are parsed; the length-weighted
    ``SeqUtils.GC`` value and the summed length of all non-pseudo CDS,
    tmRNA, rRNA, ncRNA and tRNA features are divided by the total length.

    NOTE: for now, the coding density does not take overlapping genes into
    account. Depending on how many of them are present in a genome, this may
    cause an overestimation of the coding density, as each feature is
    accounted for separately (and a same region may be counted several
    times).

    Returns a dict mapping the file name with every ".gbk" removed to
    ``(gc, coding_density, total_length)``.
    """
    counted_types = ("CDS", "tmRNA", "rRNA", "ncRNA", "tRNA")
    hsh_gen_stats = {}

    for gbk_file in gbk_list:
        ttl_length = 0
        gc_cum = 0
        cds_length = 0

        for record in SeqIO.parse(gbk_file, "genbank"):
            record_length = len(record)
            ttl_length += record_length
            gc_cum += SeqUtils.GC(record.seq) * record_length

            for fet in record.features:
                if fet.type not in counted_types:
                    continue
                if "pseudo" in fet.qualifiers:
                    continue
                # iterate the parts so that compound locations are handled
                cds_length += sum(
                    part.end - part.start for part in fet.location.parts)

        key = gbk_file.replace(".gbk", "")
        hsh_gen_stats[key] = (
            float(gc_cum) / ttl_length,
            float(cds_length) / ttl_length,
            ttl_length,
        )
    return hsh_gen_stats
예제 #17
0
    def _find_iseq(self,
                   seq: Seq,
                   iseq_str: str,
                   iseq_id: str = "integrated sequence") -> int:
        """Return the position of the single occurrence of a subsequence.

        Args:
            seq: Sequence to search.
            iseq_str: The subsequence to look for.
            iseq_id (optional): Name of the subsequence used in error
                messages. Defaults to "integrated sequence".

        Returns:
            int: The position of iseq_str within seq as reported by
                SeqUtils.nt_search.

        Raises:
            PartException: If iseq_str is absent from the sequence or
                occurs more than once.
        """
        # nt_search yields the pattern first; everything after it is a hit.
        hits = SeqUtils.nt_search(str(seq), iseq_str)[1:]
        if not hits:
            raise PartException(f"{self.id} lacks {iseq_id}")
        if len(hits) > 1:
            raise PartException(f"{self.id} contains multiple {iseq_id}")
        return hits[0]
def my_own_filtering(input_file, output_file, filt_gc=45, filt_arom=0.01):
    """Filter nucleotide FASTA records and write the kept ones as proteins.

    Every record is translated; it is kept when its ``SeqUtils.GC`` value is
    at least ``filt_gc`` and the aromaticity of its translation is at least
    ``filt_arom``. The translations of the kept records are written as FASTA
    to ``output_file`` and the kept fraction is printed.

    Args:
        input_file: Path of the nucleotide FASTA file to read.
        output_file: Path of the protein FASTA file to write. (Earlier this
            argument was ignored and a file named 'my_fasta' was written.)
        filt_gc: Minimum GC value.
        filt_arom: Minimum aromaticity.

    Records sharing an id overwrite each other, the last one wins.
    """
    proteins = {}
    total = 0

    with open(input_file, "r") as content:
        for record in SeqIO.parse(content, "fasta"):
            total += 1

            calc_gc = SeqUtils.GC(record.seq)

            prot_seq = record.seq.translate()
            calc_arom = ProteinAnalysis(str(prot_seq)).aromaticity()

            if calc_gc >= filt_gc and calc_arom >= filt_arom:
                # Keep the translation itself so it is not computed twice.
                proteins[record.id] = prot_seq

    records = [
        SeqRecord(prot_seq, id=seq_id, description="")
        for seq_id, prot_seq in proteins.items()
    ]

    with open(output_file, 'w') as write_file:
        SeqIO.write(records, write_file, 'fasta')

    # Fraction of records kept; an empty input reports 0.0 instead of
    # failing on a division by zero.
    print(len(records) / total if total else 0.0)
예제 #19
0
def main():
    """Print, for every gene, the motifs that occur in its 3'UTR."""
    # Genome sequence and annotations
    genome = load_file('http://tritrypdb.org/common/downloads/release-6.0/TcruziCLBrenerEsmeraldo-like/fasta/data/TriTrypDB-6.0_TcruziCLBrenerEsmeraldo-like_Genome.fasta')
    annotations = load_file('http://tritrypdb.org/common/downloads/release-6.0/TcruziCLBrenerEsmeraldo-like/gff/data/TriTrypDB-6.0_TcruziCLBrenerEsmeraldo-like.gff')

    # 3'UTR motifs from the supplementary material of Najafabadi et al. (2013)
    motifs = load_motifs('najafabadi_table_s1_2013.csv')

    chromosomes = load_fasta(genome)

    # 3'UTR coordinates derived from the annotations
    genes = get_utr_coords(annotations, utr_length=500)

    for gene in genes:
        utr_seq = get_3utr_seq(chromosomes, gene)

        # nt_search returns the pattern followed by the hit positions; a
        # motif is present when at least one position follows.
        utr3_motifs = [
            motif for motif in motifs
            if SeqUtils.nt_search(utr_seq, motif)[1:]
        ]

        print("%s: %s" % (gene['id'], ", ".join(utr3_motifs)))
예제 #20
0
파일: main.py 프로젝트: 0xrutvij/Archive
def gdps(str_list):
    """Pair each string's ``SeqUtils.GC`` value with a running x position.

    The first string is placed at 50 and every following one 100 further.

    Returns a list of ``(position, gc)`` tuples in input order.
    """
    return [
        (50 + 100 * index, SeqUtils.GC(strng))
        for index, strng in enumerate(str_list)
    ]
예제 #21
0
 def get33original_seq(self, aa_code_string):
     """Normalise a string of three-letter residue codes.

     The input is read three characters at a time (upper-cased first when
     ``self.upper`` is 1). Gap triplets and ``self.unknown3`` pass through
     unchanged. Codes present in ``Raf.to_one_letter_code`` are mapped to
     one letter and back to three letters with ``SeqUtils.seq3``; a result
     listed in ``self.blankseq3`` becomes ``self.unknown3``. Any other code
     becomes ``self.unknown3``. A ``self.separator`` following a triplet is
     copied to the output.

     Returns the upper-cased normalised string.
     """
     if self.upper == 1:
         aa_code_string = aa_code_string.upper()

     gap3 = 3 * self.gap_char
     len_sep = len(self.separator)
     pieces = []
     i = 0
     while i < len(aa_code_string):
         aa_code = aa_code_string[i:i + 3]
         if aa_code == gap3 or aa_code == self.unknown3:
             aa_code_original = aa_code
         elif aa_code in Raf.to_one_letter_code:
             # ``in`` replaces dict.has_key, which Python 3 no longer has.
             aa_code_original = SeqUtils.seq3(
                 Raf.to_one_letter_code[aa_code])
             if aa_code_original in self.blankseq3:
                 aa_code_original = self.unknown3
         else:
             aa_code_original = self.unknown3
         pieces.append(aa_code_original)
         i += 3
         if aa_code_string[i:i + len_sep] == self.separator:
             i += len_sep
             pieces.append(self.separator)
     return ''.join(pieces).upper()
예제 #22
0
def generate_wide_table(all_fastas):
    """Write one tab-separated row of k-mer counts per FASTA record.

    For every record of every file the occurrences of each k-mer of length
    ``args.kmer_length`` are counted. Each output row holds the file path,
    the record description, the sequence length, the ``SeqUtils.GC`` value
    and the count of every possible A/T/G/C k-mer in sorted order. A header
    row is written first unless ``args.append`` is set.

    Reads the module-level ``args`` and writes to the module-level
    ``output_handle``. Records sharing path, description, length and GC
    overwrite each other.

    Args:
        all_fastas: Iterable of FASTA file paths.
    """
    global args, output_handle
    kmer_length = args.kmer_length
    all_kmers = sorted(
        "".join(x) for x in itertools.product('ATGC', repeat=kmer_length))

    records_to_kmer = {}
    for f in all_fastas:
        logger.debug("Processing file %s" % (f))
        for record in SeqIO.parse(f, "fasta", generic_dna):
            logger.debug("Processing sequence %s" % (record.description))

            seq = str(record.seq)
            fasta_keys = (f, record.description, str(len(seq)),
                          str(SeqUtils.GC(record.seq)))

            counts = collections.defaultdict(int)
            # "+ 1" so that the k-mer ending on the last base is counted;
            # it used to be skipped.
            for i in range(len(seq) - kmer_length + 1):
                counts[seq[i:i + kmer_length]] += 1
            records_to_kmer[fasta_keys] = counts

    if not args.append:
        header = ["path", "sequence_description", "sequence_length", "GC"]
        output_handle.write("\t".join(header + all_kmers) + "\n")

    for k, kmer_values in records_to_kmer.items():
        all_values = list(k)
        all_values.extend(str(kmer_values.get(x, 0)) for x in all_kmers)
        output_handle.write("\t".join(all_values) + "\n")
예제 #23
0
def tbl_format(bed4_rrna, bed4_cds, bed4_trna):
    """
    Render BED4 annotations as feature-table (tbl) entries.

    Every entry has the shape
    start\\tend\\ttype\\n\\t\\t\\tkey\\tvalue\\n
    tRNA and rRNA annotations yield one entry each, CDS annotations yield a
    "gene" entry followed by a "CDS" entry (with transl_table 5).

    The annotation name decides the type; when a name occurs in several
    inputs, CDS takes precedence over tRNA, which takes precedence over
    rRNA.

    :param bed4_rrna: list of [reference, start, end, name] for rRNAs
    :param bed4_cds: list of [reference, start, end, name] for CDSs
    :param bed4_trna: list of [reference, start, end, name] for tRNAs
    :return: list of entry strings sorted by BED line, or an error string
             when the first lines do not share the same reference
    """
    # sanity check on the reference of the first line of each input
    if not (bed4_rrna[0][0] == bed4_cds[0][0] == bed4_trna[0][0]):
        return "Error, annotations not from the same reference!"

    # later groups overwrite earlier ones for identical names
    type_dict = {}
    for label, entries in (("rRNA", bed4_rrna),
                           ("tRNA", bed4_trna),
                           ("CDS", bed4_cds)):
        for entry in entries:
            type_dict[entry[3]] = label

    one_qualifier = "{start}\t{end}\t{type}\n\t\t\t{key}\t{value}\n"
    two_qualifiers = "{start}\t{end}\t{type}\n\t\t\t{key1}\t{value1}\n\t\t\t{key2}\t{value2}\n"

    out_l = []
    for chro, start, end, anno in sorted(bed4_rrna + bed4_cds + bed4_trna):
        kind = type_dict[anno]
        if kind == "tRNA":
            product = "tRNA-" + str(SeqUtils.seq3(anno))
            out_l.append(one_qualifier.format(
                start=start, end=end, type="tRNA",
                key="product", value=product))
        elif kind == "rRNA":
            out_l.append(one_qualifier.format(
                start=start, end=end, type="rRNA",
                key="product", value=anno))
        elif kind == "CDS":
            gene_entry = one_qualifier.format(
                start=start, end=end, type="gene",
                key="gene", value=anno)
            cds_entry = two_qualifiers.format(
                start=start, end=end, type="CDS",
                key1="product", value1=anno,
                key2="transl_table", value2=5)
            out_l.append(gene_entry + cds_entry)

    return out_l
예제 #24
0
def get_sequences(pdb_id, chain=None):
    '''Return the one-letter sequence of each chain of a structure.

    Only residues that have a 'CA' child are included. When chain is
    given, only the chain with that id is returned.
    '''
    sequences = []
    for chn in get_structure(pdb_id).get_chains():
        if not (chain is None or chain == chn.get_id()):
            continue
        resnames = [
            residue.get_resname()
            for residue in chn
            if 'CA' in residue.child_dict
        ]
        sequences.append(SeqUtils.seq1(''.join(resnames)))
    return sequences
예제 #25
0
파일: summary.py 프로젝트: emmaver/IBP19-20
def plot(unmappeddict, unmap_stats, out):
    """ Draw the summary plots of the missing regions and save them as jpg.

    Four images are written to the output directory: a hexbin joint plot of
    region length against GC content, one distribution plot each for GC
    content and length, and a boxplot of columns 3 to 23 of the statistics
    table.

    Parameters
    ----------
    unmappeddict: dict
        Dictionary of the coordinates and sequences of the unmapped regions
    unmap_stats: dataframe
        Table containing the unmapped regions summary statistics
    out: str
        Output directory

    """

    def _finish(axes, filename):
        # Shared tail of every axes-level plot: tidy, save, clear.
        sns.despine()
        figure = axes.get_figure()
        figure.savefig(os.path.join(out, filename))
        plt.clf()

    def _distribution(values, colour, label, title, filename):
        # One distribution plot with the settings shared by GC and length.
        plt.figure(figsize=(15, 10))
        sns.set(style='white', font_scale=1.3)
        axes = sns.distplot(values, hist=True, rug=False, color=colour)
        axes.set(xlabel=label)
        axes.set_title(title)
        _finish(axes, filename)

    gc_content = []
    regions_length = []
    for _, region in unmappeddict.items():
        gc_content.append(SeqUtils.GC(region))
        regions_length.append(len(region))

    plt.figure(figsize=(10, 10))
    sns.set(style='white', font_scale=2)
    joint_grid = sns.jointplot(regions_length, gc_content, kind='hex', height=7)
    joint_grid.set_axis_labels(xlabel='Length', ylabel='GC Content')
    joint_grid.savefig(os.path.join(out, 'gc_length_joint_missing.jpg'))
    plt.clf()

    _distribution(gc_content, 'red', 'GC Content',
                  'Distribution of GC Content', 'gc_content_missing.jpg')
    _distribution(regions_length, 'green', 'Length',
                  'Distribution of Length', 'length_missing.jpg')

    plt.figure(figsize=(15, 10))
    sns.set(style='white', font_scale=1.2)
    box_axes = sns.boxplot(data=unmap_stats.iloc[:, 3:24], palette='Spectral')
    box_axes.set_xlabel('Translated Codons')
    box_axes.set_ylabel('Mean Percentage per Frame (%)')
    _finish(box_axes, 'codons_missing.jpg')
예제 #26
0
def get_Seq_ORF_features(file_path,input_file,model):
    """Collect sequence and ORF features for every record of a FASTA file.

    Per record (keyed by its lower-cased id) the length, ``SeqUtils.GC``
    value, the values returned by ``PP.param``, ``leng.len_cov`` and ``CTD``
    are stored. Afterwards ``feamodule/cpat.py`` below ``file_path`` is run
    on ``input_file`` and columns 2, 3 and 4 of its ``temp_cpat.txt.dat``
    output are added as "ORF", "fickett" and "hexamer". The temporary file
    is removed at the end.

    NOTE(review): both external commands are built by string concatenation
    and run through the shell; paths containing spaces or shell
    metacharacters will misbehave.

    Returns ``(features_dict, seq_id, transcript_sequences)``.
    """
    seq_id = []
    features_dict = {}
    transcript_sequences = []

    for record in SeqIO.parse(input_file, "fasta"):
        name = record.id.lower()
        seq = record.seq
        seq_id.append(name)
        transcript_sequences.append(seq)

        entry = {"length": len(record.seq), "G+C": SeqUtils.GC(record.seq)}

        insta_fe, PI_fe, gra_fe = PP.param(seq)
        _, Cov, inte_fe = leng.len_cov(seq)
        entry.update({
            "ORF-integrity": inte_fe,
            "ORF-coverage": Cov,
            "Instability": insta_fe,
            "PI": PI_fe,
            "Gravy": gra_fe,
        })

        (A, T, G, C, AT, AG, AC, TG, TC, GC,
         A0, A1, A2, A3, A4, T0, T1, T2, T3, T4,
         G0, G1, G2, G3, G4, C0, C1, C2, C3, C4) = CTD(seq)
        entry.update({
            'A': A, 'T': T, 'G': G, 'C': C,
            'AT': AT, 'AG': AG, 'AC': AC, 'TG': TG, 'TC': TC, 'GC': GC,
            'A0': A0, 'A1': A1, 'A2': A2, 'A3': A3, 'A4': A4,
            'T0': T0, 'T1': T1, 'T2': T2, 'T3': T3, 'T4': T4,
            'G0': G0, 'G1': G1, 'G2': G2, 'G3': G3, 'G4': G4,
            'C0': C0, 'C1': C1, 'C2': C2, 'C3': C3, 'C4': C4,
        })
        features_dict[name] = entry

    # Use cpat to get fickett, hexamer and ORF
    os.system("python3 "+file_path+"/feamodule/cpat.py -g "+input_file+" -o temp_cpat.txt -x "+model_reference[model][1])
    with open("temp_cpat.txt.dat", "r") as tabular:
        for row in csv.reader(tabular, delimiter=("\t")):
            name = row[0].lower()
            orf_value = float(row[2])
            fickett_value = float(row[3])
            hexamer_value = float(row[4])
            target = features_dict[name]
            target["ORF"] = orf_value
            target["fickett"] = fickett_value
            target["hexamer"] = hexamer_value
    os.system("rm temp_cpat.txt.dat")
    return features_dict,seq_id,transcript_sequences
예제 #27
0
def main():
    """Flag contigs whose GC content deviates too far from the mean.

    The GC content of every contig in ``args["fna"]`` is computed (rounded
    to two decimals). Contigs whose absolute difference from the mean
    exceeds ``args["cutoff"]`` are written, one id per line, to
    ``flagged_contigs`` in the temporary directory.
    """
    args = fetch_args()
    utility.add_tmp_dir(args)
    utility.check_input(args)

    print("\n## Computing mean contig GC content")
    contigs = {}
    for contig_id, seq in utility.parse_fasta(args["fna"]):
        entry = Contig()
        entry.id = contig_id
        entry.seq = str(seq)
        entry.gc = round(SeqUtils.GC(seq), 2)
        contigs[contig_id] = entry
    mean = np.mean([entry.gc for entry in contigs.values()])

    print("\n## Computing per-contig deviation from mean")
    for entry in contigs.values():
        entry.values = {}
        entry.values["delta"] = abs(entry.gc - mean)

    print("\n## Identifying outlier contigs")
    flagged = [
        entry.id
        for entry in contigs.values()
        if entry.values["delta"] > args["cutoff"]
    ]

    out = f"{args['tmp_dir']}/flagged_contigs"
    print(f"   {len(flagged)} flagged contigs: {out}")
    with open(out, "w") as f:
        f.writelines(flagged_id + "\n" for flagged_id in flagged)
예제 #28
0
def get_stats_from_contigs(contigs_fasta):
    """
    Read a contigs fasta and tabulate the length and GC of every contig.

    The contig name is the first whitespace-delimited token of the fasta
    title. Returns a DataFrame indexed by 'contig' with the columns
    'length' and 'GC'.
    """
    rows = []

    # SimpleFastaParser is 2-3 times faster than SeqIO.parse
    # (and only marginally slower than a custom built parser).
    with open(contigs_fasta, 'r') as CF:
        for title, sequence in SeqIO.FastaIO.SimpleFastaParser(CF):
            # keep only the part of the title before the first whitespace
            name = title.split(None, 1)[0]
            rows.append((name, len(sequence), SeqUtils.GC(sequence)))

    table = pandas.DataFrame({'contig': [row[0] for row in rows],
                              'length': [row[1] for row in rows],
                              'GC': [row[2] for row in rows]})
    return table.set_index('contig')
def parseSeqRecordForOligo(record,oligo):
    '''Return True when oligo is found in the record's sequence, else False.'''
    # nt_search returns a list whose first item is the search pattern;
    # any further items are match positions.
    hits = SeqUtils.nt_search(str(record.seq), oligo)
    return len(hits) > 1
예제 #30
0
def gene_feature(Y, X, learn_options):
    '''
    Build one row of gene-level features for every row of Y.

    For each distinct name in Y['Target gene'] the gene sequence is
    looked up once and its length, GC content, DNA melting temperature
    and DNA molecular weight are written to all rows carrying that name.
    X and learn_options are accepted but not used here.

    Returns a DataFrame that shares Y's index, with the columns
    'gene length', 'gene GC content', 'gene temperature' and
    'gene molecular weight'.
    '''
    gene_names = Y['Target gene']
    names_array = gene_names.values
    n_rows = gene_names.shape[0]

    gene_length = np.zeros((n_rows, 1))
    gc_content = np.zeros((n_rows, 1))
    temperature = np.zeros((n_rows, 1))
    molecular_weight = np.zeros((n_rows, 1))

    for gene in gene_names.unique():
        seq = util.get_gene_sequence(gene)
        rows = names_array == gene
        gene_length[rows] = len(seq)
        gc_content[rows] = SeqUtil.GC(seq)
        temperature[rows] = Tm.Tm_staluc(seq, rna=False)
        molecular_weight[rows] = SeqUtil.molecular_weight(seq, 'DNA')

    features = np.concatenate(
        (gene_length, gc_content, temperature, molecular_weight), axis=1)
    column_names = ['gene length',
                    'gene GC content',
                    'gene temperature',
                    'gene molecular weight']
    return pandas.DataFrame(data=features, index=gene_names.index,
                            columns=column_names)
def candidates_for_seq(seq, descriptor, GC_requirement=(0, 100)):
    """Collect spacer candidates located right after each PAM_SEQ hit.

    Scans ``seq`` left to right (overlapping hits included). For each
    occurrence of PAM_SEQ, the SPACER_LENGTH characters that follow it
    form the target; it is kept when its GC value lies within
    ``GC_requirement`` (inclusive ``(low, high)`` pair). Scanning stops
    at the first hit whose target would run past the end of ``seq``.

    Args:
        seq: sequence to scan; must support ``len``, slicing and
            ``find(sub, start)`` (e.g. ``str``).
        descriptor: prefix for candidate names; the target start offset
            is appended to it.
        GC_requirement: ``(low, high)`` bounds on the target GC value.

    Returns:
        A list of dicts with keys 'name', 'seqrec' (a SeqRecord built
        from the target) and 'location' (target start offset in ``seq``).
    """
    candidates = []
    search_from = 0
    while search_from < len(seq):
        # Search in place from the current offset instead of slicing
        # seq[i:] on every iteration (which copied the remaining tail).
        pam_start = seq.find(PAM_SEQ, search_from)
        if pam_start == -1:
            break

        target_start = pam_start + len(PAM_SEQ)
        target_end = target_start + SPACER_LENGTH
        if target_end > len(seq):
            # Later hits start even further right, so none can fit.
            break

        # Resume one position past this hit so overlapping PAMs are seen.
        search_from = pam_start + 1

        targetSeq = seq[target_start:target_end]
        GC_content = SeqUtils.GC(targetSeq)
        if GC_content < GC_requirement[0] or GC_content > GC_requirement[1]:
            continue

        name = descriptor + str(target_start)
        target = SeqRecord(targetSeq, id=name, name=name, description=name)
        candidates.append({
            'name': target.id,
            'seqrec': target,
            'location': target_start,
        })
    return candidates
예제 #32
0
 def __init__(self, file, fastaRecord):
     """Record summary statistics for one FASTA record.

     file: stored unchanged as ``self.file``.
     fastaRecord: record providing ``seq`` and ``description``; the
     sequence length, GC value and CRC32 checksum are derived from ``seq``.
     """
     super(SequenceStat, self).__init__()
     sequence = fastaRecord.seq
     self.file = file
     self.description = fastaRecord.description
     self.length = len(sequence)
     self.gc = SeqUtils.GC(sequence)
     self.crc32 = CheckSum.crc32(sequence)
예제 #33
0
def generate_long_table(all_fastas):
    """Write one tab-separated row per (sequence, k-mer) to output_handle.

    Reads the module-level ``args`` (``append``, ``kmer_length``, ``star``)
    and ``output_handle``. Unless ``args.append`` is set, a header row is
    written first. For every record of every fasta file, each distinct
    k-mer is written together with the file path, the record description,
    the sequence length, the GC value and the k-mer's count.

    With ``args.star`` set, every third position of a k-mer (indices
    2, 5, 8, ...) is replaced by "*" before counting.

    Changes from the previous version:
      * uses ``output_handle.write`` instead of the Python 2 only
        ``print >> handle`` statement, so it runs on Python 2 and 3;
      * the window loop now includes the final k-mer (the old bound
        ``len(seq) - kmer_length`` skipped the last window, so a sequence
        of exactly ``kmer_length`` bases produced no rows);
      * the masking loop no longer reuses the outer loop variable.
    """
    if not args.append:
        header = ["path", "sequence_description", "sequence_length", "GC",
                  "kmer", "count"]
        output_handle.write("\t".join(header) + "\n")

    k = args.kmer_length
    for f in all_fastas:
        logger.debug("Processing file %s", f)
        for record in SeqIO.parse(f, "fasta", generic_dna):
            logger.debug("Processing sequence %s", record.description)

            seq = str(record.seq)
            # Columns shared by every row of this record.
            fasta_keys = [f, record.description, str(len(seq)),
                          str(SeqUtils.GC(record.seq))]

            kmer_count = collections.defaultdict(int)
            # Window starts run from 0 to len(seq) - k inclusive.
            for start in range(len(seq) - k + 1):
                kmer = seq[start:start + k]
                if args.star:
                    masked = list(kmer)
                    for pos in range(2, k, 3):
                        masked[pos] = "*"
                    kmer = "".join(masked)
                kmer_count[kmer] += 1

            for kmer, count in kmer_count.items():
                output_handle.write(
                    "\t".join(fasta_keys + [kmer, str(count)]) + "\n")
예제 #34
0
def extractPDBdata(structure, adjustChains, substitutionData, verbose):
    """Map residue sequence numbers to substitution values per chain.

    For every chain whose id is in ``adjustChains``, the chain's residues
    are walked in order next to ``substitutionData[chainID]``, a sequence
    of entries whose first item is a one-letter residue name and whose
    second item is a value. A residue is paired with the current entry
    only when its one-letter name matches and its hetero flag is blank;
    otherwise it is passed over without advancing to the next entry.
    Values equal to "-" are not stored. The walk through a chain stops
    once all entries have been consumed.

    Returns a dict ``{chainID: {sequenceID: value}}``. Progress is printed
    to stdout, with one line per paired residue when ``verbose`` is true.
    """
    print('Extracting atoms details from PDB...')
    pdbData = {}
    for model in structure:
        for chain in model:
            chainID = chain.get_id()
            if chainID not in adjustChains:
                continue

            chainValues = {}
            pdbData[chainID] = chainValues
            cursor = 0  # index of the next unconsumed substitution entry
            for residue in chain:
                residueName = SeqUtils.seq1(residue.get_resname())
                entry = substitutionData[chainID][cursor]
                if residueName != entry[0]:
                    continue
                heteroFlag, sequenceID, insertionCode = residue.get_id()
                if heteroFlag != ' ':
                    continue

                value = entry[1]
                if value != "-":
                    chainValues[sequenceID] = value
                if verbose:
                    print("Chain: " + chainID + "\t residue: " + residueName + " " + str(sequenceID) + "\t value: " + value)

                cursor += 1
                if cursor >= len(substitutionData[chainID]):
                    break
    print('OK')
    return pdbData
예제 #35
0
파일: trees.py 프로젝트: sarab609/scraps
def GC_content(fasta_file):
    """Print the mean GC value of a FASTA file next to its first label.

    The output line is ``<mean GC>\\t<first item of the first record>``.
    Every record returned by ``SeqUtils.quick_FASTA_reader`` contributes
    the GC value of its second item to the mean.

    Raises ZeroDivisionError when the file yields no records.
    """
    sequences = SeqUtils.quick_FASTA_reader(fasta_file)
    GCs = [SeqUtils.GC(k[1]) for k in sequences]
    mean_gc = float(sum(GCs)) / len(GCs)
    # A single-argument print(...) call behaves the same on Python 2 and 3,
    # unlike the print statement used previously.
    print(str(mean_gc) + '\t' + sequences[0][0])
예제 #36
0
def compute_stats(seq):
    """Return a SeqStats object holding length, GC value and weight of seq.

    ``weight`` is set to None when ``SeqUtils.molecular_weight`` rejects
    the sequence with a ValueError.
    """
    # Create a fresh instance per call. The previous code bound the
    # SeqStats class itself, so every call overwrote the same class-level
    # attributes and earlier results changed retroactively.
    # NOTE(review): assumes SeqStats can be constructed without arguments
    # and accepts attribute assignment on instances - confirm.
    stats = SeqStats()
    stats.length = len(seq)
    stats.gc = SeqUtils.GC(seq)
    try:
        stats.weight = SeqUtils.molecular_weight(seq)
    except ValueError:
        # Weight is best-effort; keep the other statistics.
        stats.weight = None
    return stats
def get_distances(res_pairs, get_coords):
    ''' Get distances for all pairs of residues between two chains

        res_pairs: iterable over tuples ((res_a, res_b), ...)
        get_coords: function to get residue coordinates

        Returns a list of 5-tuples, in this order:
        [(resn_a, resn_b, dist, aa_a, aa_b), ...]
        where resn_* are the residue numbers (id[1]) and aa_* the
        one-letter residue names.

    '''
    rows = []
    for res_a, res_b in res_pairs:
        resn_a = res_a.id[1]
        resn_b = res_b.id[1]
        dist = distances.calc_residue_distance(res_a, res_b, get_coords)
        aa_a = SeqUtils.seq1(res_a.resname)
        aa_b = SeqUtils.seq1(res_b.resname)
        rows.append((resn_a, resn_b, dist, aa_a, aa_b))
    return rows
예제 #38
0
def target_genes_stats(genes=("HPRT1", "TADA1", "NF2", "TADA2B", "NF1", "CUL3", "MED12", "CCDC101")):
    """Print length, GC content, melting temperature and weight per gene.

    Genes for which ``get_gene_sequence`` returns None are skipped.

    Args:
        genes: iterable of gene names (an immutable tuple by default, so
            the default cannot be modified between calls).
    """
    for gene in genes:
        seq = get_gene_sequence(gene)
        # Identity test: "!= None" would dispatch to the sequence type's
        # own comparison operator.
        if seq is None:
            continue
        # One pre-formatted argument keeps print(...) valid on Python 2 and 3.
        print("%s \t\t\t\t len: %d \t GCcont: %.3f \t Temp: %.4f \t molweight: %.4f" % (
            gene,
            len(seq),
            SeqUtil.GC(seq),
            Tm.Tm_staluc(seq, rna=False),
            SeqUtil.molecular_weight(seq, "DNA"),
        ))
예제 #39
0
def get_dihedral( residue_list ):
    '''
    Compute the phi and psi backbone dihedrals of the middle residue.

    residue_list - []Bio.PDB.Residue - list of 3 residues that must carry
    consecutive sequence numbers

    Returns ( [phi, psi], res_name ) where res_name is the one-letter
    name of the second residue.

    Raises BackboneError when the residues are not consecutive, when the
    second residue is not accepted by is_aa, or when check_dict reports a
    required backbone atom as missing.
    '''
    # Every neighbouring pair must differ by exactly one in sequence number.
    for previous, following in zip( residue_list, residue_list[1:] ):
        if following.get_id()[1] - previous.get_id()[1] != 1:
            raise BackboneError( "Discontinuous residues", following.get_id()[1] )

    # Backbone atoms needed from each of the three residues; False marks
    # "not found yet" and is replaced by the atom's vector below.
    prev_atoms = {"C": False}
    mid_atoms = {"N": False, "CA": False, "C": False}
    next_atoms = {"N": False}
    atoms = ( prev_atoms, mid_atoms, next_atoms )

    for i, residue in enumerate( residue_list ):
        if i == 1:
            res_name = SeqUtils.seq1( residue.get_resname() )
            if not is_aa( res_name ):
                raise BackboneError( "Not a valid amino acid", residue.get_id()[1] )

        for atom in residue.get_unpacked_list():
            if atom.name in atoms[i]:
                atoms[i][ atom.name ] = atom.get_vector()

    if False in map( check_dict, atoms ):
        raise BackboneError( "Missing backbone atoms", residue.get_id()[1] )

    phi = PDB.calc_dihedral( prev_atoms["C"], mid_atoms["N"], mid_atoms["CA"], mid_atoms["C"] )
    psi = PDB.calc_dihedral( mid_atoms["N"], mid_atoms["CA"], mid_atoms["C"], next_atoms["N"] )

    return ( [phi, psi], res_name )
def main():
    """Report, per gene, which known motifs occur in its 3'UTR.

    Loads the genome, the annotations and the motif table, extracts each
    gene's 3'UTR sequence and writes one CSV row per gene to
    ``args.output``: the gene id followed by every motif found.
    """
    args = parse_args()

    # Inputs: genome sequence, annotations, and the 3'UTR motif table
    # from Najafabadi et al. (2013).
    genome = load_file(args.input_genome)
    annotations = load_file(args.input_annotations)
    motifs = load_motifs('najafabadi_table_s1_2013.csv')
    chromosomes = load_fasta(genome)
    genes = get_utr_coords(annotations, utr_length=args.utr_length)

    total = len(genes)
    rows = []
    for position, gene in enumerate(genes, start=1):
        utr_seq = get_3utr_seq(chromosomes, gene)
        print('Processing gene %d/%d' % (position, total))

        # nt_search returns [pattern, hit, hit, ...]; more than one item
        # means the motif occurs at least once.
        present = [
            motif for motif in motifs
            if len(SeqUtils.nt_search(utr_seq, motif)) > 1
        ]
        rows.append([gene['id']] + present)

    with open(args.output, 'w') as output_file:
        csv.writer(output_file).writerows(rows)
예제 #41
0
def SeqUtilFeatures(data):
    '''
    Compute one feature per row of data: the molecular weight of the
    sequence stored under the '30mer' key.

    Every sequence must be exactly 30 characters long (enforced with an
    assert). Returns a single-column DataFrame with default integer row
    and column labels.
    '''
    sequences = data['30mer'].values
    weights = np.ones((sequences.shape[0], 1))
    for row, seq in enumerate(sequences):
        assert len(seq) == 30, "seems to assume 30mer"
        weights[row, 0] = SeqUtil.molecular_weight(str(seq))

    return pandas.DataFrame(weights)
예제 #42
0
파일: utils.py 프로젝트: uycire/goldenworm
def annotate_primer(primer_name, primer_seq, primer_direction, genome):
    """Append a 'misc_feature' marking a primer's first match on genome.

    primer_seq may be a SeqRecord (its ``seq`` is used) or a sequence
    object. For ``primer_direction == -1`` the reverse complement is the
    text searched for. The feature is labelled
    PRIMER_ANNOTATION_PREFIX + primer_name and appended to
    ``genome.features``; nothing is returned.

    Raises IndexError when the primer does not occur in the genome.
    """
    if type(primer_seq) == SeqRecord:
        primer_seq = primer_seq.seq
    if primer_direction == -1:
        primer_seq = primer_seq.reverse_complement()

    label = PRIMER_ANNOTATION_PREFIX + primer_name

    # nt_search returns [pattern, hit, ...]; index 1 is the first hit.
    hits = SeqUtils.nt_search(str(genome.seq), str(primer_seq))
    first_hit = hits[1]
    location = FeatureLocation(first_hit, first_hit + len(primer_seq))

    genome.features.append(
        SeqFeature(
            location=location,
            type='misc_feature',
            strand=primer_direction,
            qualifiers={'label': [label]},
        )
    )
예제 #43
0
def digest(enzyme, sequence, outfile, count):
    """Write cut-site lines for every occurrence of an enzyme's site.

    Args:
        enzyme: indexable; ``enzyme[0]`` is written as the name column,
            ``enzyme[1]`` is the pattern given to ``nt_search`` and
            ``enzyme[2]`` is the cut offset within that pattern.
        sequence: record providing ``id`` and ``seq``; the sequence is
            upper-cased before searching.
        outfile: object with a ``write`` method.
        count: running cut number used in the "cut-<n>" label.

    For each match two tab-separated lines are written, one for the "+"
    strand at ``match + offset`` and one for the "-" strand at
    ``match + len(site) - offset``, lower position first.

    Returns:
        ``count`` advanced by the number of matches.
    """
    name = enzyme[0]
    site = enzyme[1]
    cut_offset = int(enzyme[2])

    # First item of the nt_search result is the search pattern itself.
    matches = SeqUtils.nt_search(str(sequence.seq).upper(), site)

    # When the cut lies past the midpoint of the site, the antisense cut
    # has the lower coordinate and is written first to keep output sorted.
    # Integer-only comparison: same result as the old "len/2 < offset"
    # under both floor and true division.
    antisense_first = len(site) < 2 * cut_offset

    for match in matches[1:]:
        position = int(match)
        # str() replaces the Python 2 only backtick syntax.
        sense_pos = str(position + cut_offset)
        antisense_pos = str(position + int(len(site) - cut_offset))
        cut_label = "cut-" + str(count)

        line1 = "\t".join(
            [sequence.id, sense_pos, sense_pos, name, cut_label, "+"]) + "\n"
        line2 = "\t".join(
            [sequence.id, antisense_pos, antisense_pos, name, cut_label, "-"]) + "\n"

        if antisense_first:
            outfile.write(line2 + line1)
        else:
            outfile.write(line1 + line2)

        count += 1
    return count
예제 #44
0
def Chain_to_SeqRecord(chain):
    ''' Build a SeqRecord from the residues of a chain.

        chain: a Bio.PDB.Chain object

        Only the residues returned by get_nonhet_residues(chain) are used.

        Returns a Bio.SeqRecord object whose sequence is the one-letter
        residue names, whose id is the chain id, and whose
        letter_annotations['resnum'] lists the residue numbers.

    '''
    residues = list(get_nonhet_residues(chain))

    one_letter = ''.join(SeqUtils.seq1(res.get_resname()) for res in residues)
    resnums = [res.id[1] for res in residues]

    return SeqRecord.SeqRecord(Seq.Seq(one_letter), id = chain.id,
                               letter_annotations = {"resnum": resnums})
예제 #45
0
def createdb():
    """Fetch three nucleotide records and store them with 'GG' hits in SQLite.

    Steps:
      1. Post the hard-coded GI numbers to Entrez and fetch the records.
      2. For each record keep an accession label (">GI<number>"), a short
         description (first three whitespace-separated words of
         "<primary accession> <definition>") and the upper-cased first 20
         bases of the sequence.
      3. Compute the reverse complement of each stored sequence.
      4. Search both strands for 'GG' with ``SeqUtils.nt_search`` and keep
         the stringified result lists.
      5. Create the table in ``sqlite_file`` and insert one row per record.

    Fixes over the previous version: rows 2 and 3 were inserted with the
    complementary sequence of record 1, and row 3 with the description of
    record 2; each row now uses its own values. ``dict.items`` replaces
    the Python 2 only ``iteritems``, and the connection is closed even
    when a statement fails.
    """
    gis = [100753385, 100689306, 100751648]

    request = Entrez.epost("nucleotide", id=",".join(map(str, gis)))
    result = Entrez.read(request)
    handle = Entrez.efetch(db="nucleotide", retmode="xml",
                           webenv=result["WebEnv"],
                           query_key=result["QueryKey"])

    accession = []
    description = []
    sequence = []
    for r in Entrez.parse(handle):
        # Grab the GI number from the "gi|<number>" style sequence id.
        try:
            gi = int([x for x in r['GBSeq_other-seqids'] if "gi" in x][0].split("|")[1])
        except ValueError:
            gi = None
        accession.append(">GI" + str(gi))

        # Accession, definition and the first 20 bases, split on whitespace.
        words = (" " + r["GBSeq_primary-accession"] + " " + r["GBSeq_definition"]
                 + "\n" + r["GBSeq_sequence"][0:20]).split()
        description.append(' '.join(words[0:3]))
        sequence.append(words[-1].upper())

    alt_map = {'ins': '0'}
    complement = {'A': 'T', 'G': 'C', 'T': 'A', 'C': 'G'}

    def reverse_complement(seq):
        # Shield multi-character tokens behind a placeholder while the
        # string is reversed, then restore them.
        for k, v in alt_map.items():
            seq = seq.replace(k, v)
        bases = ''.join(reversed([complement.get(base, base) for base in seq]))
        for k, v in alt_map.items():
            bases = bases.replace(v, k)
        return bases

    complementary_sequence = [reverse_complement(seq) for seq in sequence]

    # Positions of 'GG' on each strand, stored as the stringified result.
    pattern = 'GG'
    exon = [str(SeqUtils.nt_search(seq, pattern)) for seq in sequence]
    comp_exon = [str(SeqUtils.nt_search(comp, pattern))
                 for comp in complementary_sequence]

    extra_columns = [
        (id_column, column_type2),
        (description_column, column_type3),
        (seq_column, column_type4),
        (comp_seq_column, column_type5),
        (PAM_column1, column_type6),
        (PAM_column2, column_type7),
    ]
    rows = [
        (number,) + fields
        for number, fields in enumerate(
            zip(accession, description, sequence,
                complementary_sequence, exon, comp_exon),
            start=1)
    ]

    conn = sqlite3.connect(sqlite_file)
    try:
        c = conn.cursor()
        c.execute('CREATE TABLE {tn} ({nf} {ft} PRIMARY KEY)'
                  .format(tn=table_name2, nf=new_field, ft=field_type))
        for column_name, column_type in extra_columns:
            c.execute("ALTER TABLE {tn} ADD COLUMN '{cn}' {ct}"
                      .format(tn=table_name2, cn=column_name, ct=column_type))
        c.executemany(
            '''INSERT INTO Gather_data(No, gi_accession, description, sequence,Complementary_sequence,PAMsites_exons,PAMsites_complementary) VALUES(?,?,?,?,?,?,?)''',
            rows)
        conn.commit()
    finally:
        conn.close()
예제 #46
0
def FindSpacersInterval(chromPos, chromStartRG, chromEndRG, seqStr, cutoff_spacing, referenceGenomeForDAS,spacerLength, distanceToCutSiteFromPAM_bp):
	"""Find spacers next to PAM matches on both strands of a genomic interval.

	The interval sequence is obtained with coords2fa(chromPos, chromStartRG,
	chromEndRG, referenceGenomeForDAS) and upper-cased. seqStr is the PAM
	pattern; it is searched on the plus strand and, unless it equals its own
	reverse complement, its reverse complement is searched as well.

	The module-level PAMside selects the geometry: 3 is treated as Cas9-like,
	any other value as Cpf1-like (see the inline comments).

	On each strand a spacer is kept only when its start lies more than
	cutoff_spacing after the start of the previously kept spacer.

	Returns (listSpacer, spacerNumTotal, meanDist) where each listSpacer
	item is
	[spacerNum, strand, startLocInRefSeq, endLocInRefSeq, chromPos,
	 startLocInRefGenome, endLocInRefGenome, cutSiteInRefGenome,
	 distFromPrevSpacer, spacerAsStr, exactPAM, GCcontent],
	spacerNumTotal is the number of kept spacers over both strands and
	meanDist is np.mean of the recorded distFromPrevSpacer values.
	"""
	from Bio import SeqFeature	

	if PAMside == 3:
		distanceToCutSiteFrom5pEnd = spacerLength - distanceToCutSiteFromPAM_bp
		# For SpCas9 (as an example): spacerLength = 20bp ; distanceToCutSiteFromPAM_bp = 3bp; distanceToCutSiteFrom5pEnd = 17bp
	else:
		distanceToCutSiteFrom5pEnd = distanceToCutSiteFromPAM_bp-1
		# For AsCpf1 (as an example): spacerLength = 20bp ; distanceToCutSiteFromPAM_bp = 19bp; distanceToCutSiteFrom5pEnd = 18bp		
		
	s = coords2fa(chromPos, chromStartRG, chromEndRG, referenceGenomeForDAS);	
	
	s=s.upper();
	PAM = Seq(seqStr, IUPAC.ambiguous_dna)
	PAM_length = len(seqStr);
	# A PAM equal to its own reverse complement needs only one search.
	# NOTE(review): forwardNameString is assigned but not used in this function.
	if seqStr == str(PAM.reverse_complement()):
		DoRevComp=0
		forwardNameString = "{name}_{num:0{width}}"
	else:
		DoRevComp=1
		forwardNameString = "{name}_F{num:0{width}}"
	listSpacer=[]
	listDistBetweenSpacers=[]
	
	spacerNum=0
	# Sentinel far to the left so the first match always passes the spacing test.
	prevStartLocInRefSeq=-9999
	# Trim spacerLength bases from the side where the spacer must fit.
	if PAMside == 3:
		gbStringForSearch = s[spacerLength:];	# Cas9
	else:
		gbStringForSearch = s[:-spacerLength];   # Cpf1, get all but last ~20 bases of sequence
				
	spacerInds = SeqUtils.nt_search(gbStringForSearch, str(PAM))
	if len(spacerInds) > 1:	# matches found 
		del spacerInds[0] # first result from nt_search is regexp expansion
		#print "len line below {fname}".format(fname=len(spacerInds))
		# NOTE(review): formatDigitsN is computed but not used in this function.
		formatDigitsN = int(math.ceil(math.log(len(spacerInds),10)));
		print "Plus strand sgRNAs found: {num}".format(num=len(spacerInds)) 

		for idx, item in enumerate(spacerInds):
			startPos = SeqFeature.ExactPosition(item)	# start and end pos of PAM
			endPos = SeqFeature.ExactPosition(item+PAM_length)  	

			if PAMside == 3:		# Cas9-like
				startLocInRefSeq = startPos+1
				endLocInRefSeq = startLocInRefSeq+spacerLength-1
			else:					# Cpf1-like
				startLocInRefSeq = endPos  #Starts immediately after PAM
				endLocInRefSeq = startLocInRefSeq+spacerLength  

			startLocInRefGenome = chromStartRG+startLocInRefSeq
			endLocInRefGenome = chromStartRG+endLocInRefSeq-1
			cutSiteInRefGenome = startLocInRefGenome+distanceToCutSiteFrom5pEnd

			# Only add the spacer if it is a certain distance from the previous spacer
			if (startLocInRefSeq-prevStartLocInRefSeq) > cutoff_spacing: 
				spacerNum += 1
				strand="+"
				# The first kept spacer has no predecessor, so its distance is recorded as 0.
				if spacerNum > 1:
					distFromPrevSpacer = startLocInRefSeq-prevStartLocInRefSeq
				else:
					distFromPrevSpacer = 0
				if PAMside == 3:
					spacerAsStr = str(s[startLocInRefSeq-1:endLocInRefSeq])
					exactPAM = s[endLocInRefSeq:endLocInRefSeq+PAM_length];
				else:
					spacerAsStr = str(s[startLocInRefSeq:endLocInRefSeq])
					exactPAM = s[startLocInRefSeq-PAM_length:startLocInRefSeq];  # Python slices: second index is first char you *DON'T* want

				GCcontent = SeqUtils.GC(spacerAsStr);
				listItem = [spacerNum, strand, startLocInRefSeq, endLocInRefSeq, chromPos, startLocInRefGenome, endLocInRefGenome, 
							cutSiteInRefGenome, distFromPrevSpacer, spacerAsStr, exactPAM, GCcontent]				
				listSpacer.append(listItem)
				listDistBetweenSpacers.append(float(distFromPrevSpacer))
				prevStartLocInRefSeq=startLocInRefSeq
	
	
	print "Plus strand sgRNAs included after minimum spacing (> {limit}bp between sgRNAs): {num}".format(limit=cutoff_spacing,num=spacerNum) 
	spacerNumTotal=spacerNum
			
	# Search rev complement of PAM
	# print PAM
	# print PAM.reverse_complement()
	# Spacing and numbering restart for the minus strand.
	prevStartLocInRefSeq=-9999
	spacerNum=0
	if DoRevComp:
		# Trimmed side is mirrored relative to the plus-strand search.
		if PAMside == 3:
			gbStringForSearch = s[:-spacerLength];   # get all but last ~20 bases of sequence
		else:
			gbStringForSearch = s[spacerLength:];
			
		spacerInds = SeqUtils.nt_search(gbStringForSearch,str(PAM.reverse_complement()))
		if len(spacerInds) > 1:	# matches found 
			del spacerInds[0] # first result from nt_search is regexp expansion
			#print "len line below {fname}".format(fname=len(spacerInds))
			formatDigitsN = int(math.ceil(math.log(len(spacerInds),10)));                                                                                                                                                                          
			print "Minus strand sgRNAs found: {num}".format(num=len(spacerInds))

			for idx, item in enumerate(spacerInds): 
				startPos = SeqFeature.ExactPosition(item) 
				endPos = SeqFeature.ExactPosition(item+PAM_length)   
				#print "Start pos: {num}  End pos: {num2}".format(num=startPos,num2=endPos)
				 			
				# Start and end locations are flipped here due to reverse strand
				if PAMside == 3:
					endLocInRefSeq = endPos+1  #flipped for reverse strand
					startLocInRefSeq = endLocInRefSeq+spacerLength-1  #flipped for reverse strand
				else:
					# startLocInRefSeq is 5' end of spacer on PAM-containing strand
					# endLocInRefSeq is 3' end of spacer on PAM-containing strand
					# Hence endLocInRefSeq <  startLocInRefSeq since this is reverse strand
					startLocInRefSeq = startPos + spacerLength # b/c spacer length is the offset between gbStringForSearch to RefSeq
					endLocInRefSeq = startLocInRefSeq - spacerLength +1

				startLocInRefGenome = chromStartRG+startLocInRefSeq-1
				endLocInRefGenome = chromStartRG+endLocInRefSeq-1
				cutSiteInRefGenome = startLocInRefGenome-distanceToCutSiteFrom5pEnd
												
				# Only add the spacer if it is a certain distance from the previous spacer
				if (startLocInRefSeq-prevStartLocInRefSeq) > cutoff_spacing: 
					spacerNum += 1
					strand="-"
					if spacerNum > 1:
						distFromPrevSpacer = startLocInRefSeq-prevStartLocInRefSeq
					else:
						distFromPrevSpacer = 0
					# Spacer and PAM are read from the plus strand and reverse-complemented.
					if PAMside == 3:# Cas9-like
						spacerRC = Seq(str(s[endLocInRefSeq-1:startLocInRefSeq]), IUPAC.ambiguous_dna)
						spacerAsStr = str(spacerRC.reverse_complement())
						exactPAM = str(Seq(str(s[endLocInRefSeq-(PAM_length+1):endLocInRefSeq-1]), IUPAC.ambiguous_dna).reverse_complement())
					else:	# Cpf1-like
						spacerRC = Seq(str(s[endLocInRefSeq-1:startLocInRefSeq]), IUPAC.ambiguous_dna)
						spacerAsStr = str(spacerRC.reverse_complement())
						exactPAM = str(Seq(str(s[startLocInRefSeq:startLocInRefSeq+PAM_length]), IUPAC.ambiguous_dna).reverse_complement())
						

					GCcontent = SeqUtils.GC(spacerAsStr);
					listItem = [spacerNum, strand, startLocInRefSeq, endLocInRefSeq, chromPos, startLocInRefGenome, endLocInRefGenome, 
								cutSiteInRefGenome, distFromPrevSpacer, spacerAsStr, exactPAM, GCcontent]				

					listSpacer.append(listItem)
					listDistBetweenSpacers.append(float(distFromPrevSpacer))
					prevStartLocInRefSeq=startLocInRefSeq		

		print "Minus strand sgRNAs included after minimum spacing (> {limit}bp between sgRNAs): {num}".format(limit=cutoff_spacing,num=spacerNum) 
		spacerNumTotal=spacerNumTotal+spacerNum;
	
	# NOTE(review): when no spacer was kept the list is empty and np.mean
	# of an empty array yields nan - confirm callers expect that.
	arrDistBetweenSpacers = np.asarray(listDistBetweenSpacers)
	meanDist = np.mean(arrDistBetweenSpacers)
	return (listSpacer, spacerNumTotal, meanDist)
예제 #47
0
파일: util.py 프로젝트: bmcorser/Azimuth
def target_genes_stats(genes=('HPRT1', 'TADA1', 'NF2', 'TADA2B', 'NF1', 'CUL3', 'MED12', 'CCDC101')):
    """Print length, GC content, melting temperature and weight per gene.

    Genes for which ``get_gene_sequence`` returns None are skipped.

    Args:
        genes: iterable of gene names (an immutable tuple by default, so
            the default cannot be modified between calls).
    """
    for gene in genes:
        seq = get_gene_sequence(gene)
        # Identity test: "!= None" would dispatch to the sequence type's
        # own comparison operator.
        if seq is None:
            continue
        # One pre-formatted argument keeps print(...) valid on Python 2 and 3.
        print('%s \t\t\t\t len: %d \t GCcont: %.3f \t Temp: %.4f \t molweight: %.4f' % (
            gene,
            len(seq),
            SeqUtil.GC(seq),
            Tm.Tm_staluc(seq, rna=False),
            SeqUtil.molecular_weight(seq, 'DNA'),
        ))
예제 #48
0
    # Excerpt from a larger routine: annotate one randomly chosen record
    # and write it out in GenBank format. range(1,2) yields a single
    # iteration.
    for randomRec in range(1,2):
        # NOTE(review): random.randint includes both end points, so this
        # can evaluate records[len(records)] and never picks index 0.
        # If records is a plain list that is an IndexError risk - confirm
        # how records is indexed before changing it.
        record = records[random.randint(1, len(records))]
        newRecord = SeqRecord(record.seq)

        #writing Header
        newRecord.seq.alphabet = generic_dna
        newRecord.id = record.id
        newRecord.name = record.name
        newRecord.description = record.description
        recordSeq = str(record.seq)

        for feature in featureStatistic_container:
            if feature not in ["PBS", "STF"]:
                for variation in featureStatistic_container[feature]:
                    # Search for the feature sequence as given ...
                    # presumably writeFeature reads `occurrence` (and other
                    # names assigned here) from the enclosing scope - verify.
                    featureSeq = str(variation.seq)
                    occurrence = SeqUtils.nt_search(recordSeq, featureSeq)
                    writeFeature(strand=1)

                    # ... and for its complement.
                    # NOTE(review): this uses complement(), not
                    # reverse_complement() - confirm that is intended for
                    # the strand=-1 annotation.
                    featureSeqComplement = str(variation.seq.complement())
                    occurrence = SeqUtils.nt_search(recordSeq, featureSeqComplement)
                    writeFeature(strand=-1)
            else:
                # "STF" and "PBS" are handled by dedicated writers.
                if(feature == "STF"):
                    writeSTF()

                if(feature == "PBS"):
                    writePBS()

        SeqIO.write(newRecord, output_handle, "genbank")

예제 #49
0
파일: forms.py 프로젝트: uycire/goldenworm
    def clean(self):
        '''
        Validate the uploaded pathway file and build the assembly objects.

        Adds an unsaved InchwormAssembly model ('assembly') and a list of
        saved stages ('stages') to self.cleaned_data and returns it.

        Along the way this saves Gene rows for the annealable sequence of
        every stage and for the genome context, and a Stage row per stage.

        Raises forms.ValidationError when a stage lacks its RBS or CDS
        feature, when the pathway features are not contiguous, or when the
        sequence before/after the pathway is shorter than the required
        homology-arm length.
        '''
        cleaned_data = super(PathwayForm, self).clean()

        # Don't do anything if some fields are missing
        if not all(x in cleaned_data.keys() for x in
                   ['file', 'rbs_annotation_type', 'cds_annotation_type']):
            return cleaned_data

        def validate_contiguity(features):
            # Each feature must end exactly where the next one starts.
            for i in range(len(features) - 1):
                if features[i].location.end != features[i+1].location.start:
                    raise forms.ValidationError(
                        'Features {} (of type {}) and {} (of type {}) must be contiguous.'.format(
                            features[i].qualifiers['label'][0],
                            features[i].type,
                            features[i+1].qualifiers['label'][0],
                            features[i+1].type,
                        ))

        record = SeqIO.read(cleaned_data['file'], 'genbank')
        # Index features by (first label, feature type).
        feature_dict = {
            (feature.qualifiers['label'][0], feature.type): feature
            for feature in record.features
        }

        # Make sure all the required features are present
        # pathway_features ends up as [rbs_0, cds_0, rbs_1, cds_1, ...].
        pathway_features = []
        for stage_name in self.stage_names:
            rbs_key = (stage_name, cleaned_data['rbs_annotation_type'])
            cds_key = (stage_name, cleaned_data['cds_annotation_type'])
            try:
                pathway_features.append(feature_dict[rbs_key])
                pathway_features.append(feature_dict[cds_key])
            except KeyError as e:
                # e.args[0] is the missing (stage name, feature type) key.
                raise forms.ValidationError(
                    'Stage {} has no feature of type {}.'.format(*e.args[0]))

        # Make sure all the features are contiguous
        validate_contiguity(pathway_features)

        # Save all the annealable sequences
        annealable_seqs = []
        for i, stage_name in enumerate(self.stage_names):
            # Odd positions of pathway_features hold the CDS features.
            cds_feature = pathway_features[2*i + 1]

            annealable_seq = None

            # The first sequence context containing the CDS (on either
            # strand) wins; the forward strand is tried first.
            for sequence_context in self.sequence_contexts:
                annealable_seq_name = '{} from {}'.format(
                    stage_name, sequence_context['name'])

                # Rewind: the same file object is read once per stage.
                sequence_context['file'].seek(0)
                context_record = SeqIO.read(sequence_context['file'], 'genbank')

                search_result = SeqUtils.nt_search(
                    str(context_record.seq),
                    str(cds_feature.extract(record).seq),
                )

                # nt_search returns [pattern, hit, ...]; index 1 is the
                # first hit. start/end are stored 1-based and inclusive.
                if len(search_result) > 1:
                    annealable_seq = Gene(
                        file=sequence_context['file'],
                        start=search_result[1] + 1,
                        end=search_result[1] + len(cds_feature),
                        strand=1,
                        name=annealable_seq_name,
                        type=Gene.ANNEALABLE_SEQ,
                    )
                    annealable_seq.save()
                    break

                # No forward match found, so search the reverse strand
                rev_search_result = SeqUtils.nt_search(
                    str(context_record.seq),
                    str(cds_feature.extract(record).seq.reverse_complement()),
                )
                if len(rev_search_result) > 1:
                    annealable_seq = Gene(
                        file=sequence_context['file'],
                        start=rev_search_result[1] + 1,
                        end=rev_search_result[1] + len(cds_feature),
                        strand=-1,
                        name=annealable_seq_name,
                        type=Gene.ANNEALABLE_SEQ,
                    )
                    annealable_seq.save()
                    break

            if annealable_seq is None:
                # No sequence context matched, so do non-nested PCR directly off
                # the coding sequence
                seq_file = ContentFile('')
                annealable_seq = Gene(
                    file=seq_file,
                    start=1,
                    end=len(cds_feature),
                    strand=1,
                    name=stage_name,
                    type=Gene.ANNEALABLE_SEQ,
                )
                annealable_seq.save()
                # Write the extracted CDS as its own GenBank file and
                # attach it to the Gene just saved.
                seq_record = cds_feature.extract(record)
                seq_record.id = ''
                seq_record.name = ''
                SeqIO.write(seq_record, seq_file, 'genbank')
                annealable_seq.file.save(stage_name, seq_file)

            annealable_seqs.append(annealable_seq)

        # Save the genome
        # Both flanks around the pathway must be long enough to serve as
        # homology arms.
        if len(record[:pathway_features[0].location.start]) < self.fwd_ha_len:
            raise forms.ValidationError(
                '5’ genome context must be at least {} bp long.'.format(
                    self.fwd_ha_len))
        if len(record[pathway_features[-1].location.end:]) < self.rev_ha_len:
            raise forms.ValidationError(
                '3’ genome context must be at least {} bp long.'.format(
                    self.rev_ha_len))
        # The genome context is the record with the pathway region cut out.
        genome_record = record[:pathway_features[0].location.start] + \
                        record[pathway_features[-1].location.end:]
        genome_record.name = 'genome'
        genome_file = ContentFile('')
        # NOTE(review): start is one greater than end here; presumably this
        # marks a zero-length insertion point where the pathway was removed
        # - confirm against how Gene.start/end are consumed.
        genome = Gene(
            file=genome_file,
            start=pathway_features[0].location.start + 1,
            end=pathway_features[0].location.start,
            strand=1,
            name='Genome context',
        )
        genome.save()
        SeqIO.write(genome_record, genome_file, 'genbank')
        genome.file.save('genome', genome_file)

        # Save the stages
        cleaned_data['stages'] = []
        for i, stage_name in enumerate(self.stage_names):
            # Even positions of pathway_features hold the RBS features.
            rbs_feature = pathway_features[2*i]
            stage = Stage(
                degeneracy=str(rbs_feature.extract(record).seq),
                annealable_seq = annealable_seqs[i],
                selection_cassette=self.selection_cassettes[i],
                name=stage_name,
            )
            cleaned_data['stages'].append(stage)
            stage.save()

        # Save the InchwormAssembly object
        # The assembly is constructed but not saved in this method.
        cleaned_data['assembly'] = InchwormAssembly(
            genome=genome,
            enzyme=self.enzyme,
            library_size=self.library_size,
            dna_required=self.dna_required,
            fwd_ha_len=self.fwd_ha_len,
            rev_ha_len=self.rev_ha_len,
        )
        return cleaned_data
예제 #50
0
파일: views.py 프로젝트: uycire/goldenworm
    def get_context_data(self, **kwargs):
        """Assemble the template context describing every reaction to run.

        Reads ``self.object.output`` (one entry per stage, each holding a
        ``'gg'`` and an ``'insert'`` object) and ``self.object.primers`` (an
        iterable of ``(name, sequence)`` pairs), and adds to the parent
        context:

        * ``output`` -- the per-stage entries, each annotated in place with
          ``gg_primer_names``, ``integration_primer_names``, ``phenotype``
          and, when ``self.get_library_sizes()`` is truthy, ``dna_required``.
        * ``primers`` -- ``self.object.primers`` unchanged.
        * ``gg_pcr_details`` -- one dict per unique Golden Gate PCR, where
          uniqueness is decided by (forward primer name, reverse primer name,
          upper-cased template sequence).
        * ``round2_pcr_details`` -- one dict per stage for the second-round
          PCR of the insert.
        * ``gg_details`` -- per stage, the names of the PCR products that go
          into its Golden Gate reaction.
        * ``transformation_details`` -- per stage, the insert name and the
          selection cassette phenotype.

        Raises:
            KeyError: if a primer's full sequence is not among
                ``self.object.primers``.
            AssertionError: if a primer's annealing sequence is not found in
                its template.
        """
        output = self.object.output
        primers = self.object.primers

        library_sizes = self.get_library_sizes()

        # Stages are paired with output entries by position in primary-key
        # order.  Fetch them once instead of issuing a query per iteration.
        stages = list(self.object.stages.order_by('pk'))

        # Primers are identified by their full sequence; if two names share a
        # sequence, the later one wins.
        primer_names_by_sequence = {
            sequence: name for name, sequence in primers
        }

        def primer_name(primer):
            return primer_names_by_sequence[str(primer.full_seq().seq)]

        for i, stage_output in enumerate(output):
            stage_output['gg_primer_names'] = [
                (primer_name(primer1), primer_name(primer2))
                for primer1, primer2 in stage_output['gg'].primers
            ]
            stage_output['integration_primer_names'] = [
                primer_name(primer)
                for primer in stage_output['insert'].generate_primers()
            ]
            stage_output['phenotype'] = \
                stages[i].selection_cassette.phenotype
            if library_sizes:
                stage_output['dna_required'] = \
                    library_sizes[i] * self.object.dna_required

        # Compile unique Golden Gate PCR reactions for the tabular view
        gg_pcrs_by_primers_and_template = {}
        gg_pcr_details = []
        for i, stage_output in enumerate(output):
            gg = stage_output['gg']
            # Walk every primer pair (not a fixed count) so that each key
            # looked up when building gg_details below has been registered.
            for j, (primer1, primer2) in enumerate(gg.primers):
                search_template = str(gg.genes[j].subrecord().seq.upper())
                # Built as a tuple directly: on Python 3 ``map`` returns an
                # iterator, which cannot be concatenated with a list.
                primer_names_and_template = (
                    primer_name(primer1),
                    primer_name(primer2),
                    search_template,
                )
                if primer_names_and_template in gg_pcrs_by_primers_and_template:
                    # Same primers on the same template: already listed.
                    continue

                # Locate both annealing sites to get the PCR product length.
                # nt_search returns [pattern, pos, pos, ...], so a result of
                # length 1 means "no match".
                forward_search_result = SeqUtils.nt_search(
                    search_template,
                    primer1.anneal_seq().upper(),
                )
                reverse_search_result = SeqUtils.nt_search(
                    search_template,
                    primer2.anneal_seq().reverse_complement().upper(),
                )

                assert len(forward_search_result) > 1 and \
                       len(reverse_search_result) > 1, \
                    'primer annealing sequence not found in its template'

                # Get name of template
                stage = stages[i]
                if j == 0:
                    template_name = stage.annealable_seq.name
                elif j == 1:
                    template_name = stage.selection_cassette.name
                else:
                    template_name = 'Genome'

                # Get primer Tm
                forward_tm = recombineering.utils.Tm(
                    str(primer1.anneal_seq().seq))
                reverse_tm = recombineering.utils.Tm(
                    str(primer2.anneal_seq().seq))

                details = {
                    'product': 'gg{}-{}'.format(i+1, j+1),
                    # Forward overhang + distance between the first match of
                    # each annealing site + the whole reverse primer.
                    'size': len(primer1.overhang) +
                            (reverse_search_result[1] -
                             forward_search_result[1]) +
                            len(primer2.full_seq()),
                    # NOTE(review): despite the key name, this tuple also
                    # carries the template sequence as its third element.
                    'primer_names': primer_names_and_template,
                    'template': template_name,
                    'forward_tm': forward_tm,
                    'reverse_tm': reverse_tm,
                }
                gg_pcrs_by_primers_and_template[
                    primer_names_and_template] = details
                gg_pcr_details.append(details)

        # Compile information about second-round PCRs
        round2_pcr_details = []
        for i, stage_output in enumerate(output):
            insert = stage_output['insert']
            insert_len = sum([
                insert.fwd_ha_len,
                len(insert.degeneracy),
                len(insert.sequence),
                insert.rev_ha_len,
            ])

            # Generate the primer pair once and reuse it for names and Tm.
            round2_primers = insert.generate_primers()

            details = {
                'product': 'stage{}'.format(i+1),
                'size': insert_len,
                # A real list, so the template can iterate it more than once
                # (a Python 3 ``map`` object is exhausted after one pass).
                'primer_names': [
                    primer_name(primer) for primer in round2_primers
                ],
                'template': 'gg{}'.format(i+1),
                'forward_tm': recombineering.utils.Tm(
                    str(round2_primers[0].anneal_seq().seq)),
                'reverse_tm': recombineering.utils.Tm(
                    str(round2_primers[1].anneal_seq().seq)),
            }

            if library_sizes:
                details['dna_required'] = \
                    library_sizes[i] * self.object.dna_required

            round2_pcr_details.append(details)

        # Determine what goes into which Golden Gate reaction
        gg_details = []
        for i, stage_output in enumerate(output):
            gg = stage_output['gg']
            fragments = []
            for j, (primer1, primer2) in enumerate(gg.primers):
                template = str(gg.genes[j].subrecord().seq.upper())
                primer_names_and_template = (
                    primer_name(primer1),
                    primer_name(primer2),
                    template,
                )
                # A fragment shared between stages resolves to the product
                # name under which it was first registered.
                fragments.append(
                    gg_pcrs_by_primers_and_template[
                        primer_names_and_template]['product'])

            gg_details.append({
                'product': 'gg{}'.format(i+1),
                'size': len(gg.product),
                'fragments': fragments,
            })

        # Transformation details
        transformation_details = []
        for i, round2_details in enumerate(round2_pcr_details):
            transformation_details.append({
                'insert_name': round2_details['product'],
                'phenotype': stages[i].selection_cassette.phenotype,
            })

        context = super(OutputView, self).get_context_data(**kwargs)
        context['output'] = output
        context['primers'] = primers

        context['gg_pcr_details'] = gg_pcr_details
        context['gg_details'] = gg_details
        context['round2_pcr_details'] = round2_pcr_details
        context['transformation_details'] = transformation_details

        return context
예제 #51
0
# The "test3prime.fasta" input uses its own FASTA output file name.
if fastafile == "test3prime.fasta":
    output_fh_name = "output2.fasta"

output_fh = open(output_fh_name, mode='w+')

# Same switch for the text output file.
output_text_name = (
    "output2.txt" if fastafile == "test3prime.fasta" else "output.txt"
)
output_text_fh = open(output_text_name, mode='w+')


for record in parsed:
    try:
        sequence = str(record.seq)
        # nt_search returns [pattern, pos, ...]; with no match there is no
        # element 1, so the lookup below raises IndexError.
        search = SeqUtils.nt_search(sequence, adapter)
        index = int(search[1])  # start of the first adapter match
        adapter_start = index
        adapter_end = index + len_adapter
        count_adapter_found += 1
        total_seq_count += 1
        # Trimming only happens for the literal string "True"; any other
        # value of removeadapters leaves the record untouched.
        if removeadapters == "True" and end_defn == "5":
            # 5' adapter: keep what follows the adapter.
            record = record[adapter_end:]
        elif removeadapters == "True" and end_defn == "3":
            # 3' adapter: keep what precedes the adapter.
            record = record[:adapter_start]
        # Reached only when the adapter was found; the read is written
        # whether or not it was trimmed.
        SeqIO.write(record, output_fh, format="fasta")
    except IndexError:
        # NOTE(review): reads without the adapter are counted here but are
        # neither written nor added to total_seq_count - confirm intended.
        count_adapter_not_found += 1
예제 #52
0
 def molecular_weight(self):
     """Return ``SeqUtils.molecular_weight`` of ``self.sequence`` as a protein."""
     protein_sequence = self.sequence
     return SeqUtils.molecular_weight(protein_sequence, 'protein')