def read_te_fasta(self, fasta_file):
    """Read every record of a FASTA file into two parallel dictionaries.

    Records are numbered from 1 in the order the reader yields them.

    Args:
        fasta_file: Passed unchanged to ``fastaIO.FastaReader``.

    Returns:
        A ``(ids, seqs)`` tuple of dictionaries, both keyed by the 1-based
        record number: ``ids`` maps to the sequence identifier and ``seqs``
        to the sequence itself.  Both are empty if the reader yields nothing.
    """
    # Function-scope import, kept from the original.
    from fastaIO import FastaReader

    seq_dict1 = {}
    seq_dict2 = {}

    handle = FastaReader(fasta_file)
    try:
        for ct, (seq_id, seq) in enumerate(handle, start=1):
            seq_dict1[ct] = seq_id
            seq_dict2[ct] = seq
    finally:
        # Close the reader even when iteration fails part-way through.
        handle.close()

    return seq_dict1, seq_dict2
Beispiel #2
0
def getPopGenomeStats(inputFile):
    """Summarise the sequences yielded by ``FastaReader(inputFile)``.

    Args:
        inputFile: Passed unchanged to ``FastaReader``.

    Returns:
        ``(averagegenomesize, popgenomesamples)``: the mean sequence length
        as a float, and the number of records read.

    Raises:
        ZeroDivisionError: If the reader yields no records.
    """
    # One length per record; the header of each record is not needed.
    lengths = [len(sequence) for _header, sequence in FastaReader(inputFile)]

    popgenomesamples = len(lengths)
    averagegenomesize = float(sum(lengths)) / float(popgenomesamples)

    return averagegenomesize, popgenomesamples
    def read_chesis_fasta(self, fasta_file):
        """Return the last record of a FASTA file.

        Every record is read; each one overwrites the previous, so only the
        final record is reported.

        Args:
            fasta_file: Passed unchanged to ``fastaIO.FastaReader``.

        Returns:
            A ``(seq_id, seq_length, chr_seq)`` tuple for the last record, or
            ``("", 0, "")`` if the reader yields nothing.
        """
        # Function-scope import, kept from the original.
        from fastaIO import FastaReader

        # Defaults returned when the file holds no records.
        seq_id = ""
        seq_length = 0
        chr_seq = ""

        handle = FastaReader(fasta_file)
        try:
            for seq_id, chr_seq in handle:
                seq_length = len(chr_seq)
        finally:
            # Close the reader even when iteration fails part-way through.
            handle.close()

        return seq_id, seq_length, chr_seq
Beispiel #4
0
def load_chasis(inputFile):
    """Load the single record expected in a chassis FASTA file.

    Args:
        inputFile: Passed unchanged to ``FastaReader``.

    Returns:
        ``(header, sequence)`` of the record, or ``(None, None)`` if the
        reader yields nothing.

    Raises:
        ValueError: If the reader yields more than one record.
    """
    header, sequence = None, None
    n_records = 0

    # The whole file is scanned; the loop only tracks the latest record and
    # how many records have been seen.
    for n_records, (header, sequence) in enumerate(FastaReader(inputFile), 1):
        pass

    if n_records > 1:
        raise ValueError(
            "Chasis file must only contain a single reference genome")
    return header, sequence
Beispiel #5
0
			end=te.end
			seq=maskSeq(seq,start,end)		# mask the TE with Ns
		novelrefseq[chr]=seq
	return novelrefseq



def printSequences(seq,outfasta):
	"""Write every entry of a name -> sequence mapping to a FASTA file.

	Args:
		seq: Mapping whose ``items()`` yield ``(name, sequence)`` pairs.
		outfasta: Passed unchanged to ``FastaWriter`` as the output target.
	"""
	# 60 is the second FastaWriter argument -- presumably the line width;
	# confirm against FastaWriter.
	fw=FastaWriter(outfasta,60)
	try:
		for n,s in seq.items():
			fw.write(n,s)
	finally:
		# Close the writer even if a write fails, so the handle is released.
		fw.close()
	
	

# Command-line driver: load the reference sequences and the TE annotation,
# mask the annotated TEs in the reference, and write the result as fasta.
parser = OptionParser()
parser.add_option("--gtf",dest="gtfte",help="A gtf file containing the TE annotation")
parser.add_option("--input",dest="fastaref",help="A fasta file containing the reference sequence")
parser.add_option("--output",dest="outfasta",help="The output of the fasta sequences")


(options, args) = parser.parse_args()

# optparse has no "required" flag; without this check a missing option is
# passed on as None and only fails later, inside the reader/writer code.
missing = [flag for flag, value in (("--gtf", options.gtfte),
                                    ("--input", options.fastaref),
                                    ("--output", options.outfasta))
           if value is None]
if missing:
    parser.error("missing required option(s): " + ", ".join(missing))

print("Loading refseqs..")
refseqs = FastaReader.readFastaHash(options.fastaref)
print("Loading gtf..")
noveltegtf= GTFTEReader.readall(options.gtfte)
print("Masking reference sequence..")
novelrefseq=maskTEsinSeq(noveltegtf,refseqs)
print("Printing masked reference sequence..")
printSequences(novelrefseq,options.outfasta)
Beispiel #6
0
    return novelrefseq


def printSequences(seq, outfasta):
    """Write each ``(name, sequence)`` pair of *seq* through a FastaWriter.

    Args:
        seq: Mapping whose ``items()`` yield ``(name, sequence)`` pairs.
        outfasta: Passed unchanged to ``FastaWriter`` as the output target.
    """
    # NOTE(review): 60 is the second FastaWriter argument, presumably the
    # output line width -- confirm against FastaWriter.
    fw = FastaWriter(outfasta, 60)
    try:
        for n, s in seq.items():
            fw.write(n, s)
    finally:
        # Always release the writer, including when a write raises.
        fw.close()


# Script entry: parse the three file options, then load, mask and write the
# reference sequences.
parser = OptionParser()
parser.add_option("--gtf",
                  dest="gtfte",
                  help="A gtf file containing the TE annotation")
parser.add_option("--input",
                  dest="fastaref",
                  help="A fasta file containing the reference sequence")
parser.add_option("--output",
                  dest="outfasta",
                  help="The output of the fasta sequences")

(options, args) = parser.parse_args()

# All three options are needed below; optparse cannot mark options as
# required, so report any that were left out as a usage error.
missing = [
    flag for flag, value in (
        ("--gtf", options.gtfte),
        ("--input", options.fastaref),
        ("--output", options.outfasta),
    ) if value is None
]
if missing:
    parser.error("missing required option(s): " + ", ".join(missing))

print("Loading refseqs..")
refseqs = FastaReader.readFastaHash(options.fastaref)
print("Loading gtf..")
noveltegtf = GTFTEReader.readall(options.gtfte)
print("Masking reference sequence..")
novelrefseq = maskTEsinSeq(noveltegtf, refseqs)
print("Printing masked reference sequence..")
printSequences(novelrefseq, options.outfasta)


 
    
parser.add_argument("--chassis", type=str, required=False, dest="ref_fasta", default=None, help="the chassis, i.e. the sequence into which TEs will be inserted; a fasta file")
parser.add_argument("--te-seqs", type=str, required=False, dest="te_fasta", default=None, help="TE sequences in a fasta file")
parser.add_argument("--pgd", type=str, required=True, dest="pgd_definition", default=None, help="the definition of the population genome")
parser.add_argument("--output", type=str, required=True, dest="output", default=None, help="the output file; will be multi-fasta file")

args = parser.parse_args()

# read TE sequences from file; if provided
tetuples = []
if args.te_fasta is not None:
    tmp = FastaReader.readAllTuples(args.te_fasta)
    # Keep only the second element of every tuple -- presumably the sequence
    # (header dropped); confirm against FastaReader.readAllTuples.
    tetuples = [t[1] for t in tmp]
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Loading TE sequences; Found {0} in file {1}".format(len(tetuples), args.te_fasta))
sc = SequenceContainer(tetuples)

# read the PGD; must be provided
print("Loading population genome defintion")
pgdr = PopGenDefinitionReader(args.pgd_definition, sc)
tedeftuples = pgdr.read_transposed()
print("Found {0} TE defintions".format(sc.get_count_definitions()))
print("Will simulate {0} TE insertion sites within a population having {1} haploid genomes".format(pgdr.insertions, pgdr.popsize))

# load chasis from the file; if provided otherwise from the PGD; not both though
chasis = ""
if args.ref_fasta is not None:
     if pgdr.get_chasis() !="":
Beispiel #8
0
(options, args) = parser.parse_args()

# TE family names, in the order their lengths are printed below.
teorder = [
    "1360", "17.6", "1731", "297", "3S18", "412", "accord", "accord2",
    "aurora-element", "baggins", "Bari1", "Bari2", "blood", "BS", "BS3", "BS4",
    "Burdock", "Circe", "copia", "Cr1a", "diver", "diver2", "Dm88", "Doc",
    "Doc2-element", "Doc3-element", "Doc4-element", "F-element", "FB", "flea",
    "frogger", "Fw2", "Fw3", "G-element", "G2", "G3", "G4", "G5", "G5A", "G6",
    "G7", "GATE", "gtwin", "gypsy", "gypsy10", "gypsy11", "gypsy12", "gypsy2",
    "gypsy3", "gypsy4", "gypsy5", "gypsy6", "gypsy7", "gypsy8", "gypsy9", "HB",
    "Helena", "HeT-A", "HMS-Beagle", "HMS-Beagle2", "hobo", "hopper",
    "hopper2", "I-element", "Idefix", "INE-1", "invader1", "invader2",
    "invader3", "invader4", "invader5", "invader6", "Ivk", "jockey", "jockey2",
    "Juan", "looper1", "Mariner", "mariner2", "Max-element", "McClintock",
    "mdg1", "mdg3", "micropia", "NOF", "opus", "Osvaldo", "P-element", "pogo",
    "Porto1", "Q-element", "Quasimodo", "R1-2", "R1A1-element", "R2-element",
    "roo", "rooA", "rover", "Rt1a", "Rt1b", "Rt1c", "S-element", "S2",
    "springer", "Stalker", "Stalker2", "Stalker3", "Stalker4", "Tabor",
    "TAHRE", "Tc1", "Tc1-2", "Tc3", "Tirant", "Tom1", "transib1", "transib2",
    "transib3", "transib4", "Transpac", "X-element", "ZAM"
]

print("Loading refseqs..")
refseqs = FastaReader.readFastaHash(options.teseqs)
f2e = read_famtoentry(options.hier)

# One "<family>\t<length>" line per family.  A family missing from the
# hierarchy, or an entry missing from the fasta, raises KeyError.
for fam in teorder:
    entry = f2e[fam]
    seq = refseqs[entry]
    # Call form of print: same output on Python 2, and valid on Python 3.
    print("{0}\t{1}".format(fam, len(seq)))
                a=l.split("\t")
                entry=a[0]
                fam=a[2]
                ord=a[4]
                fto[fam]=entry
        return fto


# Script entry: print the sequence length of each TE family in a fixed order.
parser = OptionParser()
parser.add_option("--input",dest="teseqs",help="The TE seqs")
parser.add_option("--hier",dest="hier",help="the te hierarchy")
(options, args) = parser.parse_args()

# Both files are needed below; optparse cannot mark options as required, so
# report a missing one as a usage error instead of passing None onwards.
missing = [flag for flag, value in (("--input", options.teseqs),
                                    ("--hier", options.hier))
           if value is None]
if missing:
    parser.error("missing required option(s): " + ", ".join(missing))

# TE family names, in the order their lengths are printed below.
teorder = [
    "1360", "17.6", "1731", "297", "3S18", "412", "accord", "accord2",
    "aurora-element", "baggins", "Bari1", "Bari2", "blood", "BS", "BS3", "BS4",
    "Burdock", "Circe", "copia", "Cr1a", "diver", "diver2", "Dm88", "Doc",
    "Doc2-element", "Doc3-element", "Doc4-element",
    "F-element", "FB", "flea", "frogger", "Fw2", "Fw3", "G-element", "G2",
    "G3", "G4", "G5", "G5A", "G6", "G7", "GATE", "gtwin", "gypsy", "gypsy10",
    "gypsy11", "gypsy12", "gypsy2", "gypsy3", "gypsy4", "gypsy5",
    "gypsy6", "gypsy7", "gypsy8", "gypsy9", "HB", "Helena", "HeT-A",
    "HMS-Beagle", "HMS-Beagle2", "hobo", "hopper", "hopper2", "I-element",
    "Idefix", "INE-1", "invader1", "invader2", "invader3", "invader4",
    "invader5", "invader6", "Ivk", "jockey", "jockey2", "Juan", "looper1",
    "Mariner", "mariner2", "Max-element", "McClintock", "mdg1", "mdg3",
    "micropia", "NOF", "opus", "Osvaldo", "P-element", "pogo",
    "Porto1", "Q-element", "Quasimodo", "R1-2", "R1A1-element", "R2-element",
    "roo", "rooA", "rover", "Rt1a", "Rt1b", "Rt1c", "S-element", "S2",
    "springer", "Stalker", "Stalker2", "Stalker3", "Stalker4",
    "Tabor", "TAHRE", "Tc1", "Tc1-2", "Tc3", "Tirant", "Tom1", "transib1",
    "transib2", "transib3", "transib4", "Transpac", "X-element", "ZAM",
]

print("Loading refseqs..")
refseqs = FastaReader.readFastaHash(options.teseqs)
f2e = read_famtoentry(options.hier)

# One "<family>\t<length>" line per family.  A family missing from the
# hierarchy, or an entry missing from the fasta, raises KeyError.
for fam in teorder:
    entry = f2e[fam]
    seq = refseqs[entry]
    # Call form of print: same output on Python 2, and valid on Python 3.
    print("{0}\t{1}".format(fam, len(seq)))