Esempio n. 1
0
def dig_finder():
    """Set up the inputs of a restriction-digest search and reset the stat file.

    Reads the first entry of ``./pZX-vector.fa`` (upper-cased), defines the
    digest limits and the enzyme recognition-site tables, and truncates the
    CSV report so that it starts out empty.
    """
    dirn = "./150126_BaG_Seq/"
    vector_seq = Fasta("./pZX-vector.fa").Seqs[0].upper()
    vector_size = len(vector_seq)
    stat_name = "chunk_dig_stat.csv"
    stat_dir = "/Users/xuz02/Google_Drive/workspace/Python/150126_BaG_Seq/"
    min_frag = 100
    max_dig = 5
    min_dig = 2

    # Recognition sites keyed by enzyme name.  "AgeI" used to be listed twice
    # with the same site; a dict keeps only one entry per key anyway.
    enzyme_dict = {
        "EcoRI": "GAATTC",
        "NotI": "GCGGCCGC",
        "BamHI": "GGATCC",
        "HindIII": "AAGCTT",
        "KpnI": "GGTACC",
        "SacI": "GAGCTC",
        "SalI": "GTCGAC",
        "SpeI": "ACTAGT",
        "NheI": "GCTAGC",
        "AgeI": "ACCGGT",
        "BsaI": "GGTCTC",
        "EcoRV": "GATATC",
        "NcoI": "CCATGG",
        "PstI": "CTGCAG",
        "XbaI": "TCTAGA",
    }
    enzyme_dict_II = {"BsaI": "GGTCTC", "BsmBI": "CGTCTC"}
    # NOTE(review): the settings above are not consumed within the lines of
    # this function that are visible here.

    # Opening in "w" mode truncates the report; the context manager closes it
    # right away.
    with open(stat_dir + stat_name, "w"):
        pass
Esempio n. 2
0
def chr04_pcrtag_stat():
    """Locate every PCRtag primer in the megachunk sequences and write a CSV.

    The primer file is split on carriage returns and commas and consumed as
    consecutive (name, sequence) pairs.  A name containing ``synF`` is searched
    as given (upper-cased); any other entry is searched as the upper-cased
    reverse complement of its sequence.  For each chunk that contains the
    primer, one line ``chunk name, primer name, primer name, position`` is
    written, where position is the 0-based index of the first occurrence.
    """
    import re

    dirn = "/workspace/Python/161212_megachunk_csPCR/"
    primers = "PCRtags_syn.csv"
    mega = "synIV_mega.fa"
    output = "chr04_pcrtag_stat.csv"
    mega_f = Fasta(dirn + mega)

    # Read the whole primer table up front so its handle is released at once.
    with open(dirn + primers, "r") as primers_fp:
        primer_ls = re.split("\r|,", primers_fp.read())

    # Upper-case each chunk once instead of once per primer.
    chunks = [(mega_f.Names[j], mega_f.Seqs[j].upper())
              for j in range(len(mega_f.Names))]

    stats = []
    # Floor division keeps the pair count an int on Python 3 as well.
    for i in range(len(primer_ls) // 2):
        name = primer_ls[2 * i]
        raw_seq = primer_ls[2 * i + 1]
        if "synF" in name:
            p_seq = raw_seq.upper()
        elif "\n" not in raw_seq:
            p_seq = Seq_Analyzer(raw_seq).rcSeq().upper()
        else:
            # A field holding a line break is not a usable sequence.  Skip the
            # search instead of silently reusing the previous primer's
            # sequence (or failing on an unbound name for the first entry).
            p_seq = None

        if p_seq is not None:
            for chunk_name, sq in chunks:
                _pos = sq.find(p_seq)
                if _pos != -1:
                    stats.append([chunk_name, name, name, _pos])
        print("%s done!" % name)

    with open(dirn + output, "w") as op:
        for item in stats:
            op.write("%s, %s, %s, %d\n" % (item[0], item[1], item[2], item[3]))
    mega_f.close()
Esempio n. 3
0
def info(args):
    """
    Print the headers and lengths of the given fasta files.

    With ``-n`` set to -1 every record is listed sorted by (header, length);
    otherwise at most ``-n`` records are listed, longest first.  ``--gc``
    appends the GC percentage of each listed record.

    >>> info(['tests/data/three_chrs.fasta'])
    <BLANKLINE>
    tests/data/three_chrs.fasta
    ===========================
    >chr3 length:3600 
    >chr2 length:80 
    >chr1 length:80 
    <BLANKLINE>
    3760 basepairs in 3 sequences
    """
    import operator

    usage = """\
   print headers and lengths of the given fasta file in order of length. e.g.:
        pyfasta info --gc some.fasta"""
    parser = optparse.OptionParser(usage)
    parser.add_option("-n", "--n", type="int", dest="nseqs", default=20,
                      help="max number of records to print. use -1 for all")
    parser.add_option("--gc", dest="gc", action="store_true", default=False,
                      help="show gc content")
    options, fastas = parser.parse_args(args)
    if not fastas:
        # print_help() returns None, so the process exits with status 0.
        sys.exit(parser.print_help())

    for path in fastas:
        f = Fasta(path)
        lengths = [(name, len(seq)) for name, seq in f.iteritems()]
        total_len = sum(size for _, size in lengths)
        nseqs = len(f)

        if options.nseqs > -1:
            # Longest first, truncated to the requested number of records.
            lengths.sort(key=operator.itemgetter(1), reverse=True)
            lengths = lengths[:options.nseqs]
        else:
            lengths.sort()

        print("\n" + path)
        print("=" * len(path))
        for name, size in lengths:
            suffix = ""
            if options.gc:
                bases = str(f[name]).upper()
                gc_count = bases.count('G') + bases.count('C')
                suffix = "gc:%.2f%%" % (100.0 * gc_count / float(size))
            print((">%s length:%i " % (name, size)) + suffix)

        # Totals above one million are reported in megabases.
        if total_len > 1000000:
            shown_total = "%.3fM" % (total_len / 1000000.)
        else:
            shown_total = total_len
        print()
        print("%s basepairs in %i sequences" % (shown_total, nseqs))
Esempio n. 4
0
	def cds_fasta(self,genomefasta_path,cds_outpath):
		"""Write the concatenated CDS sequences of the GFF records as FASTA.

		Records are visited in the order of ``self.all_gff().values()``.  An
		mRNA record resets the running sequence; each CDS record appends the
		genomic slice ``[start-1:end]`` to it and stores the result under the
		id captured by ``self.cds_pattern``.  Sequences on the "+" strand are
		written as accumulated; all others are reverse-complemented first.
		Entries are written to ``cds_outpath`` sorted by id.
		"""
		genome = Fasta(genomefasta_path)
		genome.readFasta()
		genomeDict = genome._fasta
		complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
		forward = {}
		reverse = {}
		# Start empty so that a CDS record seen before any mRNA record does
		# not fail on an unbound name.
		cds_seq = ''
		for record in self.all_gff().values():
			start,end = int(record.start()),int(record.end())
			feature = record.feature()
			if feature == "mRNA":
				cds_seq = ''
			elif feature == "CDS":
				key = self.cds_pattern.search(record.attribute()).group("id")
				cds_seq += genomeDict[record.seqid()].get_seq()[start-1:end]
				if record.strand() == "+":
					forward[key] = cds_seq
				else:
					reverse[key] = cds_seq
		# Reverse-complement the non-"+" entries; characters other than
		# A/T/C/G are passed through unchanged.
		for key, seq in reverse.items():
			forward[key] = ''.join(complement.get(base, base) for base in reversed(seq))
		# The context manager closes the output even if a record fails to format.
		with open(cds_outpath,'w') as CDS:
			for key in sorted(forward):
				CDS.writelines(fasta_record(key,forward[key]).fasta_parse())
Esempio n. 5
0
def linkerPCR_primers():
    """Make linker PCR primers and one construct file per chunk.

    For every chunk sequence the forward primer is its last ``right`` bases
    followed by the ``forward`` linker, and the reverse primer is the reverse
    complement of its first ``left`` bases followed by the ``reverse`` linker.
    Each chunk is also written, concatenated with the first backbone sequence,
    to ``pZX4_<first 19 characters of the name>.fasta``.  The primers are
    collected in ``liner_primers.csv``.
    """
    chunks = "yeast_chr01_chunks.FA"
    dirn = "/Users/xuz02/Google_Drive/workspace/Python/150218_Leslie/"
    bacbone = "pZX4_lin.fa"
    output = "liner_primers.csv"
    v_fasta = Fasta(dirn + bacbone)
    c_fasta = Fasta(dirn + chunks)
    left = 58
    right = 57
    reverse = "ggccggccccagcttttgttc"
    forward = "cggccggccctatagtgagtcg"

    # Context managers guarantee both the table and every per-chunk file are
    # closed even when a chunk fails half-way through.
    with open(dirn + output, "w") as o_fp:
        o_fp.write("Name, Forward Primer, Reverse Primer\n")
        for n, chunk_seq in enumerate(c_fasta.Seqs):
            f_primer = chunk_seq[-right:] + forward
            r_primer = Seq_Analyzer(chunk_seq[:left]).rcSeq() + reverse
            name = c_fasta.Names[n]
            short_name = name[:19]
            with open(dirn + "pZX4_" + short_name + ".fasta", "w") as fp:
                fp.write(">%s\n" % name)
                fp.write(chunk_seq + v_fasta.Seqs[0])
            o_fp.write("%s, %s, %s\n" % (short_name, f_primer, r_primer))
Esempio n. 6
0
def main():
    '''Count the positions at which two FASTA files differ.

    Sequences are paired by the first whitespace-delimited token of their
    header and compared case-insensitively.  Over the overlapping prefix every
    differing position counts once; the length difference of the pair is added
    on top.  The total is printed to stdout and the sorted, tab-separated ids
    of the sequences with at least one differing position go to stderr.

    Raises:
        KeyError: if a header of the second file is absent from the first.
    '''
    opts = options()
    dScafs1 = {}
    fasta1 = Fasta(opts.fasta1)
    for i in range(len(fasta1.headers)):
        dScafs1[fasta1.headers[i].split()[0]] = fasta1.seqs[i].upper()

    cnt = 0
    ids = set()
    fasta2 = Fasta(opts.fasta2)
    for i in range(len(fasta2.headers)):
        header = fasta2.headers[i].split()[0]
        seq2 = fasta2.seqs[i].upper()
        seq1 = dScafs1[header]
        overlap = min(len(seq2), len(seq1))
        # enumerate() is lazy on both Python 2 and 3, so no index list the
        # size of the sequence is materialised.
        mismatches = sum(1 for j, base in enumerate(seq2[:overlap])
                         if base != seq1[j])
        if mismatches:
            ids.add(header)
        cnt += mismatches + abs(len(seq2) - len(seq1))
    print(cnt)
    sys.stderr.write('\t'.join(sorted(ids)) + '\n')
Esempio n. 7
0
def extract(args):
    """
    Print selected sequences of a fasta file.

    The positional arguments name the records to print; with ``--file`` the
    first positional argument is instead a file listing one name per line.
    ``--exclude`` inverts the selection (output sorted by name), ``--header``
    prints a ``>name`` line before each sequence and ``--space`` keys records
    by the header text up to the first whitespace.

    >>> extract(['--fasta', 'tests/data/three_chrs.fasta', 'chr2'])
    TAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAT
    """

    parser = optparse.OptionParser("""extract some sequences from a fasta file. e.g.:
               pyfasta extract --fasta some.fasta --header at2g26540 at3g45640""")
    parser.add_option("--fasta", dest="fasta", help="path to the fasta file")
    parser.add_option("--header", dest="header", help="include headers", action="store_true", default=False)
    parser.add_option("--exclude", dest="exclude", help="extract all sequences EXCEPT those listed", action="store_true", default=False)
    parser.add_option("--file", dest="file", help=\
                      "if this flag is used, the sequences to extract" \
                      " are read from the file specified in args"
                      , action="store_true", default=False)
    parser.add_option("--space", dest="space", action="store_true", help=\
                      "use the fasta identifier only up to the space as the key",
                      default=False)
    options, seqs = parser.parse_args(args)
    if not (options.fasta and len(seqs)):
        # print_help() returns None, so the process exits with status 0.
        sys.exit(parser.print_help())

    key_fn = (lambda k: k.split()[0]) if options.space else None
    f = Fasta(options.fasta, key_fn=key_fn)
    if options.file:
        # Read the name list eagerly so the handle is closed before printing.
        with open(seqs[0]) as name_file:
            seqs = [x.strip() for x in name_file]
    if options.exclude:
        seqs = sorted(frozenset(f.iterkeys()).difference(seqs))

    # Single-argument print(...) behaves identically on Python 2 and 3.
    for seqname in seqs:
        seq = f[seqname]
        if options.header:
            print(">%s" % seqname)
        print(seq)
Esempio n. 8
0
def compare_CDS():
    """Load the codon table and the two chr04 gene FASTA files."""
    import Codon
    import os
    import re
    codon_table = Codon.c_table()
    dirn = "/Workplace/Python/PCRtagChange"
    # dirn has no trailing separator, so plain concatenation glued the
    # directory name onto the file name; join the components instead.
    wt = Fasta(os.path.join(dirn, "yeast_chr04_0_00_genes.fa"))
    syn = Fasta(os.path.join(dirn, "yeast_chr04_3_66_genes.fa"))
Esempio n. 9
0
def chr01_pcrtag_stat():
    """Locate every csPCR primer in the minichunk sequences and write a CSV.

    The primer file is split on carriage returns and commas and consumed as
    consecutive triples; the second field of each triple, lower-cased, is the
    sequence searched for.  For each minichunk that contains it, one line
    ``chunk name, first field, third field, position`` is written, where
    position is the 0-based index of the first occurrence.
    """
    import re

    dirn = "./160521_JL/"
    primers = "csPCR_primers.csv"
    miniC = "synI_mini_chunks.fasta"
    output = "chr01_pcrtag_stat.csv"
    miniC_f = Fasta(dirn + miniC)

    # Read the whole primer table up front so its handle is released at once.
    with open(dirn + primers, "r") as primers_fp:
        primer_ls = re.split("\r|,", primers_fp.read())

    stats = []
    # Floor division keeps the triple count an int on Python 3 as well.
    for i in range(len(primer_ls) // 3):
        first_field = primer_ls[3 * i]
        p_seq = primer_ls[3 * i + 1].lower()
        third_field = primer_ls[3 * i + 2]
        for j in range(len(miniC_f.Names)):
            # One find() replaces the former "in" test followed by find().
            _pos = miniC_f.Seqs[j].find(p_seq)
            if _pos != -1:
                stats.append([miniC_f.Names[j], first_field, third_field, _pos])

    with open(dirn + output, "w") as op:
        for item in stats:
            op.write("%s, %s, %s, %d\n" % (item[0], item[1], item[2], item[3]))
    miniC_f.close()
Esempio n. 10
0
def main():
    '''Print every record of the FASTA file named in the options.

    Each record is emitted as a ``>header`` line followed by its sequence.
    '''
    opts = options()
    fasta = Fasta(opts.fasta)
    # range() and single-argument print(...) work on both Python 2 and 3.
    for i in range(len(fasta.headers)):
        print(">%s" % fasta.header(i))
        print(fasta.seq(i))
Esempio n. 11
0
def execute():
    """Print the first two sequences of ``rosalind_tran.txt`` and their ratio.

    Both sequences are upper-cased before being handed to ``compute_ratio``.
    """
    source = Fasta( "rosalind_tran.txt" )
    first_segment = source.get_segments()[0]
    s1 = first_segment.get_sequence().upper()
    second_segment = source.get_segments()[1]
    s2 = second_segment.get_sequence().upper()
    print( "s1",s1 )
    print( "s2",s2)
    print( "ratio=" + str(compute_ratio( s1, s2 )) )
Esempio n. 12
0
def readinput( path ):
    """
    Load the FASTA file at *path* and return the sequences of its first two
    segments as a pair.
    """
    segments = Fasta( path ).get_segments()
    return segments[0].get_sequence(), segments[1].get_sequence()
Esempio n. 13
0
def execute():
    """Find the restriction sites of the first sequence in ``rosalind_revp.txt``.

    Each entry returned by ``find_restrictions`` is reported as its first two
    fields separated by a space, both on stdout and in ``output_revp.txt``.
    """
    fasta = Fasta( "rosalind_revp.txt" )
    dna = fasta.get_segments()[0].get_sequence()
    restrictions = find_restrictions( dna )
    # The context manager flushes and closes the report, which the previous
    # version never did.
    with open( "output_revp.txt", "w" ) as output:
        for site in restrictions:
            line = str(site[0]) + " " + str(site[1])
            print( line )
            output.write( line + "\n" )
Esempio n. 14
0
def readinput( path ):
    """
    Load the FASTA file at *path*, echo every segment (header line followed
    by its sequence) and return the Fasta object.
    """
    fasta = Fasta( path )
    for seg in fasta.get_segments():
        seq = seg.get_sequence()
        print( seg.get_header() + "\n" + seq )
    return fasta
Esempio n. 15
0
def info(args):
    """
    Print the headers and lengths of the given fasta files.

    With ``-n`` set to -1 every record is listed sorted by (header, length);
    otherwise at most ``-n`` records are listed, longest first.  ``--gc``
    appends the GC percentage of each listed record.

    >>> info(['tests/data/three_chrs.fasta'])
    <BLANKLINE>
    tests/data/three_chrs.fasta
    ===========================
    >chr3 length:3600 
    >chr2 length:80 
    >chr1 length:80 
    <BLANKLINE>
    3760 basepairs in 3 sequences
    """
    parser = optparse.OptionParser("""\
   print headers and lengths of the given fasta file in order of length. e.g.:
        pyfasta info --gc some.fasta""")

    parser.add_option("-n", "--n", type="int", dest="nseqs",
                      help="max number of records to print. use -1 for all",
                      default=20)
    parser.add_option("--gc", dest="gc", help="show gc content",
                      action="store_true", default=False)
    options, fastas = parser.parse_args(args)
    if not (fastas):
        # print_help() returns None, so the process exits with status 0.
        sys.exit(parser.print_help())
    import operator

    for fasta in fastas:
        f = Fasta(fasta)
        info = [(k, len(seq)) for k, seq in f.iteritems()]

        total_len = sum(l for k, l in info)
        nseqs = len(f)
        if options.nseqs > -1:
            info = sorted(info, key=operator.itemgetter(1), reverse=True)
            info = info[:options.nseqs]
        else:
            info.sort()

        # Every print below takes exactly one argument, so the output is the
        # same whether print is a statement (Python 2) or a function (3).
        print("\n" + fasta)
        print("=" * len(fasta))
        for k, l in info:
            gc = ""
            if options.gc:
                seq = str(f[k]).upper()
                g = seq.count('G')
                c = seq.count('C')
                gc = 100.0 * (g + c) / float(l)
                gc = "gc:%.2f%%" % gc
            print((">%s length:%i " % (k, l)) + gc)

        # Totals above one million are reported in megabases.
        if total_len > 1000000:
            total_len = "%.3fM" % (total_len / 1000000.)
        # print("") rather than print(): the latter prints "()" on Python 2.
        print("")
        print("%s basepairs in %i sequences" % (total_len, nseqs))
Esempio n. 16
0
def outputFasta(fishPath, Bait, Except):
    """Print FASTA records from the fish pool, filtered by the bait table.

    When *Except* is falsy, the records whose ID has a truthy entry in *Bait*
    are printed; when it is truthy, the remaining records are printed instead.
    """
    pool = Fasta(fishPath).readFasta()
    for ID, record in pool.items():
        line_out = record.fasta_parse()
        baited = Bait.get(ID)
        selected = (not baited) if Except else baited
        if selected:
            print(line_out.strip())
Esempio n. 17
0
def cons(input_string):
  '''Solve http://rosalind.info/problems/cons/

  Prints the consensus sequence on the first line, followed by one
  "<key>: <space-separated values>" line per row of the profile.
  '''
  reads = [read_string for label, read_string in Fasta(input_string).all()]
  profile = consensus_profile(reads)
  output = [''.join(consensus_sequence(profile))]
  for row_index, row in enumerate(profile):
    values = ' '.join(map(str, row))
    output.append('{}: {}'.format(PROFILE_MATRIX_KEY_INDEXES[row_index], values))
  print('\n'.join(output))
Esempio n. 18
0
def fill_bins(fasta_filename, index, out, prefix):
    """Append each indexed sequence to the file of every group it belongs to.

    *index* maps a sequence id to an iterable of groups; the record is
    appended, in FASTA form, to ``<out>/<prefix><group>.fa`` for each group.
    Sequences whose id is not in *index* are ignored.
    """
    for sequence in Fasta(fasta_filename).read():
        seq_id = sequence["id"]
        if seq_id not in index:
            continue
        for group in index[seq_id]:
            bin_path = "{}/{}{}.fa".format(out, prefix, group)
            # Append mode: a bin accumulates records across sequences.
            with open(bin_path, "a") as fw:
                fw.write(">{}\n{}\n".format(seq_id, sequence["value"]))
Esempio n. 19
0
def test_select_enzyme():
    """Run the enzyme selection on every FASTA file of the test directory.

    The first record of each file whose name ends in "FASTA" (any case) is
    passed to ``select_restriction_enyzme``.  For every selected enzyme a
    ``re_<enzyme name>.csv`` file lists, per sequence, its name followed by
    the digest bands; ``non_verified.csv`` lists the names of the sequences
    whose indices are returned as not verified.
    """
    dirn = "/Users/xuz02/Google_Drive/workspace/Python/test_dig/"
    seq_name_ls = []
    seq_ls = []
    for fn in os.listdir(dirn):
        if fn[-5:].upper() == "FASTA":
            fa = Fasta(dirn + fn)
            seq_ls.append(fa.Seqs[0])
            seq_name_ls.append(fa.Names[0])
    enzymes, dig_band_ls, re_name_ls, n_verified_ls =\
        select_restriction_enyzme(seq_ls)

    print(enzymes)
    # One CSV per selected enzyme; context managers close each file even if a
    # row fails to format.
    for enzyme in enzymes:
        with open(dirn + "re_" + re_name_ls[enzyme] + ".csv", "w") as fop:
            for n, seq_name in enumerate(seq_name_ls):
                fop.write("%s, " % seq_name)
                for band in dig_band_ls[n][enzyme]:
                    fop.write("%d, " % band)
                fop.write("\n")
    with open(dirn + "non_verified.csv", "w") as fop:
        for i in n_verified_ls:
            fop.write(seq_name_ls[i] + "\n")
Esempio n. 20
0
def main():
    '''Apply the substitutions of a gzipped SNP table to a FASTA file.

    Scaffolds are keyed by the first whitespace-delimited token of their
    header and upper-cased.  Each tab-separated SNP line supplies the scaffold
    id (first column, reduced to its first two "_"-separated parts), the
    1-based position (third column) and the from/to nucleotides (fourth and
    fifth columns).  Lines with "." in either nucleotide column are skipped.
    The target nucleotide is always written; a warning goes to stderr when
    the current base differs from the expected one.  The edited scaffolds are
    printed to stdout and the number of applied SNPs to stderr.
    '''
    opts = options()
    dScafs = {}
    fasta = Fasta(opts.fasta)
    for i in range(len(fasta.headers)):
        # A list of characters allows in-place substitution.
        dScafs[fasta.headers[i].split()[0]] = list(fasta.seqs[i].upper())

    cnt = 0
    with gzip.open(opts.snps) as handle:
        for line in handle:
            # gzip yields bytes on Python 3 and str on Python 2.
            if not isinstance(line, str):
                line = line.decode()
            items = line.strip().split('\t')
            scafId, pos = items[0], int(items[2])
            scafId = '_'.join(scafId.split("_")[:2])
            if items[3] == "." or items[4] == ".":
                continue  # indel record
            fromNt, toNt = items[3].upper(), items[4].upper()
            current = dScafs[scafId][pos - 1]
            if current.upper() != fromNt:
                sys.stderr.write("Does not match!!! %s vs %s\n" %
                                 (fromNt, current))
            # The substitution is applied whether or not the base matched.
            dScafs[scafId][pos - 1] = toNt
            cnt += 1
    for key in dScafs:
        print(">%s\n%s" % (key, ''.join(dScafs[key])))
    sys.stderr.write("SNP count: %d\n" % cnt)
Esempio n. 21
0
def batch_PCR_Primers_at_End():
    """Design end primers for every minichunk and save them as CSV.

    Every record of each file in ``folder`` whose name contains "fasta" is
    passed to ``Seq_Analyzer.Find_Primer_at_Ends``.  For each Tm bin flagged
    truthy, one row is written: the record name without its last character,
    the first four fields of the forward and of the reverse primer entry for
    that bin, and the fifth field of the reverse entry.
    """
    folder = "/workspace/Python/170116_SynIV/minichunks/"
    savefile = "/workspace/Python/170116_SynIV/primers.csv"

    L_res = 2
    R_res = 2
    Tm_ls = range(52, 59)
    minLen = 20
    maxLen = 50
    primer_ls = []

    for fn in os.listdir(folder):
        if "fasta" not in fn:
            continue
        fa = Fasta(folder + fn)
        for n, seq in enumerate(fa.Seqs):
            F_primer, R_primer, Tm_bin = Seq_Analyzer(seq).\
                Find_Primer_at_Ends(L_res, R_res, Tm_ls, minLen, maxLen)
            primer_ls.append([fa.Names[n], F_primer, R_primer, Tm_bin])

    # The context manager flushes and closes the table, which the previous
    # version never did.
    with open(savefile, "w+") as output:
        for name, F_primer, R_primer, Tm_bin in primer_ls:
            for n in range(len(Tm_ls)):
                if not Tm_bin[n]:
                    continue
                output.write(str(name[:-1]) + ",")
                for primer in (F_primer, R_primer):
                    for j in range(4):
                        output.write(str(primer[n][j]) + ",")
                output.write(str(R_primer[n][4]) + ",")
                output.write("\n")
Esempio n. 22
0
def execute():
    """Print the Hamming distance of the first two sequences in ``rosalind_tran.txt``.

    Both sequences are upper-cased before comparison.

    Raises:
        Exception: if the two sequences differ in length.
    """
    from fasta import Fasta

    source = Fasta( "rosalind_tran.txt" )
    s = source.get_segments()[0].get_sequence().upper()
    t = source.get_segments()[1].get_sequence().upper()
    if len(s) == len(t):
        print( hamm_dist(s,t) )
    else:
        raise Exception( "lengths do not match" )
Esempio n. 23
0
def extract(args):
    """
    Print selected sequences of a fasta file.

    The positional arguments name the records to print; with ``--file`` the
    first positional argument is instead a file listing one name per line.
    ``--exclude`` inverts the selection (output sorted by name), ``--header``
    prints a ``>name`` line before each sequence and ``--space`` keys records
    by the header text up to the first whitespace.

    >>> extract(['--fasta', 'tests/data/three_chrs.fasta', 'chr2'])
    TAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAT
    """

    parser = optparse.OptionParser(
        """extract some sequences from a fasta file. e.g.:
               pyfasta extract --fasta some.fasta --header at2g26540 at3g45640"""
    )
    parser.add_option("--fasta", dest="fasta", help="path to the fasta file")
    parser.add_option("--header",
                      dest="header",
                      help="include headers",
                      action="store_true",
                      default=False)
    parser.add_option("--exclude",
                      dest="exclude",
                      help="extract all sequences EXCEPT those listed",
                      action="store_true",
                      default=False)
    parser.add_option("--file", dest="file", help=\
                      "if this flag is used, the sequences to extract" \
                      " are read from the file specified in args"
                      , action="store_true", default=False)
    parser.add_option("--space", dest="space", action="store_true", help=\
                      "use the fasta identifier only up to the space as the key",
                      default=False)
    options, seqs = parser.parse_args(args)
    if not (options.fasta and len(seqs)):
        # print_help() returns None, so the process exits with status 0.
        sys.exit(parser.print_help())

    key_fn = (lambda k: k.split()[0]) if options.space else None
    f = Fasta(options.fasta, key_fn=key_fn)
    if options.file:
        # Read the name list eagerly so the handle is closed before printing;
        # the previous generator left the file open.
        with open(seqs[0]) as name_file:
            seqs = [x.strip() for x in name_file]
    if options.exclude:
        seqs = sorted(frozenset(f.iterkeys()).difference(seqs))

    for seqname in seqs:
        seq = f[seqname]
        if options.header:
            print(">%s" % seqname)
        print(seq)
Esempio n. 24
0
 def openfasta(self, fn):
     """Load the first record of a FASTA file into the sequence-analyzer tab.

     *fn* is used as a path when it is a plain ``str``; any other object is
     expected to provide ``text()``, whose value is resolved against
     ``self.dirn``.  The record name and sequence are shown in the widgets,
     ``self.dirn`` is updated to the directory of the file and the
     sequence-analyzer tab is brought to the front.
     """
     is_plain_path = fn.__class__.__name__ == "str"
     if not is_plain_path:
         fn = self.dirn + "/" + str(fn.text())
     loaded = Fasta(fn)
     first_seq = loaded.Seqs[0]
     self.lName.setText(loaded.Names[0])
     self.currentSeq = first_seq
     self.SeqEdit.setPlainText(first_seq)
     self.dirn = os.path.dirname(fn)
     self.tabWidget.setCurrentWidget(self.tab_seq_analyzer)
Esempio n. 25
0
def flatten(args):
    """
    Open every fasta file named in *args* with ``flatten_inplace=True``.

    >>> flatten(['tests/data/three_chrs.fasta'])
    """
    usage = """flatten a fasta file *inplace* so all later access by pyfasta will use that flattend (but still viable) fasta file"""
    parser = optparse.OptionParser(usage)
    paths = parser.parse_args(args)[1]
    for fa in paths:
        # Constructing the object is what triggers the in-place flattening.
        f = Fasta(fa, flatten_inplace=True)
Esempio n. 26
0
def length_reporter():
    """Write the length of every record of the TWIST order file to a CSV.

    Only the directory entry named ``20150819_TWIST_LabOrder.txt`` is
    processed; each of its records produces one ``name,length,`` line in
    ``length_stat_0819.csv``.
    """
    dirn = "/Users/xuz02/Google_Drive/Project Data/ORDERS/"
    stat = "length_stat_0819.csv"
    # The context manager closes the report even if parsing fails.
    with open(dirn + stat, "w") as stat_fp:
        for fn in os.listdir(dirn):
            if fn != "20150819_TWIST_LabOrder.txt":
                continue
            fasta = Fasta(dirn + fn)
            for n, seq in enumerate(fasta.Seqs):
                stat_fp.write(fasta.Names[n] + "," + str(len(seq)) + "," + "\n")
Esempio n. 27
0
 def n50(self, assembly):
     '''Return the N50 of the assembly FASTA file.

     Sequence lengths are accumulated from the longest to the shortest; the
     N50 is the length of the sequence at which the running total first
     reaches half of ``totalLen``.  Returns None when the file holds no
     sequences.
     '''
     fasta = Fasta("%s" % assembly)
     running = 0
     # N50 is defined over the longest sequences first; the previous
     # ascending sort accumulated from the shortest ones instead.
     for length in sorted((len(seq) for seq in fasta.seqs), reverse=True):
         running += length
         # Doubling the running total avoids Python 2 floor division on
         # odd totals.
         if 2 * running >= fasta.totalLen:
             return length
     return None
Esempio n. 28
0
def main():
    '''Report details of the scaffold and gene FASTA files named in the options.

    The reference genome is optional: when it cannot be read, its size and
    sequence count stay at 0.  Ids listed in the contamination table (first
    tab-separated column, lines starting with "#" ignored) are removed from
    the gene set before its details are reported.
    '''
    opts = options()
    refSize, refSeqCnt = 0, 0
    try:
        ref = Fasta(opts.refGen)
    except IOError:
        # Best effort: an unreadable reference leaves the zero defaults.
        pass
    else:
        refSize, refSeqCnt = ref.totalLen, len(ref.headers)

    geneIds = set()
    if opts.contamination != "":
        with open(opts.contamination, 'r') as handle:
            for line in handle:
                myId = line.strip().split('\t')[0]
                # A blank line yields an empty id; indexing it used to raise
                # IndexError.
                if myId and not myId.startswith('#'):
                    geneIds.add(myId)
    geneIds = list(geneIds)
    myGeneIds = None

    genomeSize = None
    if opts.scafs != '':
        fastaS = Fasta(opts.scafs)
        genomeSize = detailsFasta("Genome Scaffolds", fastaS, refSize,
                                  refSeqCnt)
    if opts.genes != '':
        fastaG = Fasta(opts.genes)
        fastaG.rmGenes(geneIds)
        myGeneIds = set([header.split()[0] for header in fastaG.headers])
        detailsFasta("Genome genes", fastaG)
Esempio n. 29
0
 def renameFasta(self, iName, oName, mName, prefix="scaf"):
     '''Write a copy of a FASTA file with sequentially numbered headers.

     Record number k (counting from 1) is written to *oName* under the header
     "<prefix><k>s"; *mName* receives one "new header<TAB>old header" line
     per record.
     '''
     fasta = Fasta(iName)
     with open(mName, 'w') as handleM, open(oName, 'w') as handleW:
         # enumerate() replaces the Python 2-only xrange and the manual counter.
         for i, oldHeader in enumerate(fasta.headers):
             newHeader = "%s%ds" % (prefix, i + 1)
             handleW.write(">%s\n" % newHeader)
             handleW.write("%s\n" % fasta.seqs[i])
             handleM.write("%s\t%s\n" % (newHeader, oldHeader))
Esempio n. 30
0
	def mRNA_fasta(self,genomefasta_path,mRNAfasta_path):
		"""Write the genomic sequence of every mRNA record as FASTA.

		For each GFF record whose feature is "mRNA", the genomic slice
		``[start-1:end]`` is stored under the id captured by
		``self.mRNA_pattern``; records not on the "+" strand are
		reverse-complemented.  All entries are written to ``mRNAfasta_path``.
		"""
		genome = Fasta(genomefasta_path)
		genome.readFasta()
		genomeDict = genome._fasta
		complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
		forward = {}
		for record in self.all_gff().values():
			start,end = int(record.start()),int(record.end())
			if record.feature() != "mRNA":
				continue
			key = self.mRNA_pattern.search(record.attribute()).group("id")
			seq = genomeDict[record.seqid()].get_seq()[start-1:end]
			if record.strand() != "+":
				# Characters other than A/T/C/G are passed through unchanged.
				seq = ''.join(complement.get(base, base) for base in reversed(seq))
			forward[key] = seq

		# The context manager closes the output even if a record fails to format.
		with open(mRNAfasta_path,'w') as mRNAfasta:
			for key in forward:
				mRNAfasta.writelines(fasta_record(key,forward[key]).fasta_parse())
Esempio n. 31
0
def DL_search_primers():
    """Design tiling PCR primers for the PigL flank and save them as CSV.

    The reference passed to ``tilling_PCR_primers`` is the concatenation of
    the first sequence of the four flank files; the target is the first
    sequence of the pigl file.  Forward primers are echoed to stdout; forward
    and then reverse primers are written with their first five fields.
    """
    dirn = "./150314_DL_PIG/"
    fn1 = "pign_3kbFLK.fa"
    fn2 = "piga_3kbFLK.fa"
    fn3 = "pigl_3kbFLK.fa"
    fn4 = "pigk_3kbFLK.fa"
    # Parse each file once; the pigl sequence is reused as the target below.
    flanks = [Fasta(dirn + fn).Seqs[0] for fn in (fn1, fn2, fn3, fn4)]
    ref = flanks[0] + flanks[1] + flanks[2] + flanks[3]

    name = "PigL"
    output = "pigL_primers_v3.csv"
    f_primer, r_primer = tilling_PCR_primers(seq=flanks[2],
                                             name=name,
                                             ref=ref)
    with open(dirn + output, "w") as fp:
        for p in f_primer:
            print(p)
            fp.write("%s,%s,%s,%s,%s\n" % (p[0], p[1], p[2], p[3], p[4]))
        for p in r_primer:
            fp.write("%s,%s,%s,%s,%s\n" % (p[0], p[1], p[2], p[3], p[4]))
Esempio n. 32
0
 def _inferScoreMatrix(self, refName, trimmedReads, identity):
     '''Infer a score matrix from random subsets of the trimmed reads.

     Inside ``self.wd``: draws 300 random subsets of 1000 sequences (seeded
     with 0), splits each into two files via ``self._splitSeqsIn2Files``,
     writes one ``lastz_D_Wrapper.pl`` command per subset to ``runs.txt``,
     runs them with ``parallel`` and calls ``self._median()``.  The resulting
     ``scoreMatrix.q`` is copied to ``<first>.<first>.q`` where ``first`` is
     *refName* up to its first dot.  Exits the process when ``scoreMatrix.q``
     is missing.

     Returns:
         The name of the copied score-matrix file.
     '''
     memDir = os.getcwd()
     os.chdir(self.wd)
     try:
         fraction = 0.1
         fasta = Fasta(trimmedReads)
         iterCnt, seqCnt = 300, 1000
         self.shell("rm -f *.q runs.txt subset*")
         random.seed(0)
         with open("runs.txt", "w") as handle:
             for i in range(iterCnt):
                 subFasta = Fasta()
                 subIdxs = random.sample(range(len(fasta.headers)),
                                         int(seqCnt))
                 subFasta.headers = [fasta.headers[j] for j in subIdxs]
                 subFasta.seqs = [fasta.seqs[j] for j in subIdxs]
                 subFasta.totalLen = sum(len(seq) for seq in subFasta.seqs)
                 fName1, fName2 = self._splitSeqsIn2Files(
                     subFasta, "subset%d" % (i + 1), fraction)
                 handle.write(
                     "lastz_D_Wrapper.pl --target=%s --query=%s --identity=%d\n"
                     % (fName1, fName2, identity))
         self.shell("cat runs.txt | parallel -j %d" % self.pCnt,
                    ignoreFailure=True)
         self._median()
         self.shell("rm -f subset*")
         #self.shell("tail -n 5 *.q > scoreMatrix.q")
         if not os.path.exists("scoreMatrix.q"):
             sys.stderr.write(
                 "# FATAL ERROR: could not create score matrix for HaploMerger. Exiting ...\n"
             )
             # NOTE(review): status 0 is kept for compatibility although this
             # is an error path - confirm whether callers rely on it.
             sys.exit(0)
         first = refName.split(".")[0]
         scoreMatrix = "%s.%s.q" % (first, first)
         print("cp scoreMatrix.q %s" % scoreMatrix)
         self.shell("cp scoreMatrix.q %s" % scoreMatrix)
         return scoreMatrix
     finally:
         # Restore the caller's working directory on every exit path.
         os.chdir(memDir)
Esempio n. 33
0
def GC_tmp():
    """Plot the GC windows of every record of ``dra_mt.fa`` that has outliers.

    Each sequence is analysed with ``GC_window_analyzer(20, 0.70, 0.40)``;
    when that reports at least one outlier, ``GC_window_analyzer_visual`` is
    called with ``(20, 0.70, 0.30, name)``.
    """
    dirn = "/Users/xuz02/Downloads/"
    fn = "dra_mt.fa"
    fasta = Fasta(dirn + fn)
    for i, seq in enumerate(fasta.Seqs):
        name = fasta.Names[i]
        GC_content, outlets = Seq_Analyzer(seq).GC_window_analyzer(
            20, 0.70, 0.40)
        if len(outlets) == 0:
            continue
        Seq_Analyzer(seq).GC_window_analyzer_visual(20, 0.70, 0.30, name)
Esempio n. 34
0
def count_loxP():
    """Record the start positions of loxP sites flanked by G in ``synIV.fa``.

    Three patterns are searched in the upper-cased first sequence: the site
    preceded by G and followed by a non-G base, preceded by a non-G base and
    followed by G, and flanked by G on both sides.  The match starts (which
    include the flanking base) are written one per line to ``loci_g.csv``,
    grouped in that pattern order.
    """
    import re
    dirn = "/Workplace/Python/161122_loxp/"
    synIV = Fasta(dirn + "synIV.fa")
    loxP_seq = "ATAACTTCGTATAATGTACATTATACGAAGTTAT"
    # Build the patterns from the single site definition instead of repeating
    # the literal three times.
    patterns = (
        r"[G]" + loxP_seq + r"[^G]",
        r"[^G]" + loxP_seq + r"[G]",
        r"G" + loxP_seq + r"G",
    )
    ref_seq = synIV.Seqs[0].upper()
    loci = []
    for pattern in patterns:
        loci.extend(m.start() for m in re.finditer(pattern, ref_seq))
    # The context manager flushes and closes the file, which the previous
    # version never did.
    with open(dirn + "loci_g.csv", "w") as output:
        for locus in loci:
            output.write(str(locus) + "\n")
Esempio n. 35
0
def minichunk_for_Twist():
    """Write GC-window and repeat statistics of the Twist order to a file.

    For every record of the order FASTA, the minimum, maximum, mean and
    standard deviation of ``GC_window_analyzer(100)`` are collected together
    with the result of ``Repetitive_Seq_analyzer(3, 4, 2)``.  The report
    holds one ";"-separated GC line per record (name, min, max, mean, std),
    followed by one ";"-separated repeat line per record.
    """
    dirn = "/Users/Zhuwei/Google_Drive/Project Data/ORDERS/Twist/"
    stat_name = "minichunk_stat.csv"
    GC_stat = []
    rep_stat = []
    fasta = Fasta("/Users/Zhuwei/Documents/Order_Twist_140607.fasta")
    for _n in range(fasta.size):
        _GC_ls = Seq_Analyzer(fasta.Seqs[_n]).GC_window_analyzer(100)
        print("pass GC\n")
        GC_stat.extend([
            fasta.Names[_n],
            min(_GC_ls),
            max(_GC_ls),
            numpy.mean(_GC_ls),
            numpy.std(_GC_ls),
        ])
        _rep_ls = Seq_Analyzer(fasta.Seqs[_n]).Repetitive_Seq_analyzer(3, 4, 2)
        print("pass rep\n")
        rep_stat.append(fasta.Names[_n])
        rep_stat.extend(_rep_ls)
        # ";;" marks the end of a record's repeat list.
        rep_stat.append(";;")

    # The context manager flushes and closes the report, which the previous
    # version never did.
    with open(dirn + stat_name, "w") as stat_fn:
        for _x, value in enumerate(GC_stat):
            stat_fn.write(str(value) + ";")
            # Five values per record, so break the line after every fifth.
            if _x % 5 == 4:
                stat_fn.write("\n")
        for item in rep_stat:
            if item != ";;":
                stat_fn.write(str(item) + ";")
            else:
                stat_fn.write(";\n")
Esempio n. 36
0
def extract_terminial_seq(length=10000, prefix=""):
    """Save both ends of every "chr" record of ``52.fasta`` as separate files.

    For each record whose name starts with "chr" (any case), the first and
    the last *length* bases are written to ``<prefix><id>_Left.fa`` and
    ``<prefix><id>_Right.fa`` in ``52_ends/``, where ``id`` is the first
    whitespace-delimited token of the record name.
    """
    dirn = "/home/zhuwei/cglabarata/comp/"
    fn = "52.fasta"
    outdirn = dirn + "52_ends/"
    fa = Fasta(dirn + fn)
    for n, _name in enumerate(fa.Names):
        if _name[:3].upper() != "CHR":
            continue
        _seq = fa.Seqs[n]
        output_prefix = prefix + _name.split()[0]
        # (file suffix, header template, sequence slice) for each end.
        ends = (
            ("_Left.fa", ">%s_left\n", _seq[:length]),
            ("_Right.fa", ">%s_right\n", _seq[-length:]),
        )
        for suffix, header, piece in ends:
            with open(outdirn + output_prefix + suffix, "w") as op:
                op.write(header % output_prefix)
                op.write(piece)
                op.write("\n")
Esempio n. 37
0
class Model():
    """Encapsulate the information of a secondary structure model.

    Instances expose ``name``, ``rna`` (the sequence identifier), ``position``
    (2-D coordinates normalised to the unit square) and ``app`` (content of
    the "_app.csv" file).
    """
    # Parsed once, when the class is defined, and shared by all instances.
    sequences = Fasta(options.FASTA_FILE)

    def __init__(self, name):
        """Load the files of model *name* from ``options.MODEL_PATH``.

        Each file is optional: on failure a default is used, and for the
        position and "_app.csv" files a message is printed.
        """
        self.name = name

        try:
            self.rna = str(
                recfromtxt(options.MODEL_PATH + self.name + "_seq.txt"))
        except Exception:
            # Default to the last key of the fasta sequences; list() keeps
            # the indexing valid when keys() returns a view (Python 3).
            self.rna = list(self.sequences.sequences.keys())[-1]

        try:
            self.position = recfromtxt(options.MODEL_PATH + self.name +
                                       "_pos.csv",
                                       delimiter=",",
                                       dtype=float)

            # Shift both columns to start at 0, then scale by the overall
            # maximum so the larger extent becomes exactly 1.
            self.position[:, 0] -= min(self.position[:, 0])
            self.position[:, 1] -= min(self.position[:, 1])
            self.position = self.position / amax(self.position)
            # Centre the column that does not span the full [0, 1] range.
            if max(self.position[:, 0]) == 1:
                self.position[:, 1] += (1 - max(self.position[:, 1])) / 2
            else:
                self.position[:, 0] += (1 - max(self.position[:, 0])) / 2

        except Exception:
            print("Position file of " + name + " model absent or corrupted")
            self.position = zeros((300, 2))

        try:
            self.app = recfromtxt(options.MODEL_PATH + self.name + "_app.csv")
        except Exception:
            print("Appariment file of " + name + " model absent or corrupted")
            self.app = zeros(300)

    # Previously nested inside __init__, where it was never bound as a method.
    def __str__(self):
        '''String representation of the model.'''
        return (self.name + " model")
Esempio n. 38
0
def readinput(path):
    """Return the sequence of the first segment of the FASTA file at *path*."""
    first_segment = Fasta(path).get_segments()[0]
    return first_segment.get_sequence()
#!/usr/bin/env python

from fasta import Fasta
import sys
import os

# Show the usage when called without arguments or with a help flag.
if len(sys.argv) == 1 or sys.argv[1] in ('-h', '--help'):
    print("Usage: %s fasta_file..." % sys.argv[0])
    sys.exit(0)

# For each input file, write "<root>-prefixed<ext>" in the current directory
# with every header prefixed by "<root>_", where <root> is the input file
# name without directory and extension.
for filename in sys.argv[1:]:
    dirname, basename = os.path.split(filename)
    root, ext = os.path.splitext(basename)
    output = root + '-prefixed' + ext

    f = Fasta()

    try:
        f.read_from(filename)
    except IOError as ioe:
        # Unreadable inputs are reported and skipped; "as" replaces the
        # Python 2-only "except IOError, ioe" form.
        print("Error: " + str(ioe))
        continue

    for seq in f:
        seq.header = root + '_' + seq.header
    f.save_to(output)
    if seq2 == None:
        return seq1
    if len(seq1[1]) > len(seq2[1]):
        return seq1
    else:
        return seq2


##################
# BEGIN PROGRAM :)
##################

# Validate the command line first (helper defined outside the lines shown here).
verify_inputs()
input_file = sys.argv[1]

# Parse the input FASTA, reporting progress on stderr.
# NOTE(review): the handle is not closed within the lines visible here.
fasta = Fasta()
file = open(input_file, "r")
sys.stderr.write("Reading fasta ...")
fasta.read(file)
sys.stderr.write("Done.\n")

# Entry retained so far for the group currently being scanned.
current_seq = None

for seq in fasta.entries:
    if not same_comp_gene(seq, current_seq):
        # A new group starts: emit the entry kept for the previous group
        # (nothing to emit before the first entry).
        if current_seq != None:
            write_seq(current_seq)
        current_seq = seq
    else:
        # Same group: keep whichever entry longer_of_the_two() prefers.
        current_seq = longer_of_the_two(seq, current_seq)