Example #1
def get_longest(contig_file, outfile, min_length):
    #returns a list of (header, sequence) pairs for the longest contig in
    #each locus
    locus_dict = defaultdict(list)
    contig_dict = {}
    for h, s in FastaReader(contig_file):
        if len(h.split("_")) > 1:
            locus_dict[h.split("_")[1]].append((h, s))
        else:
            contig_dict[h] = s

    longest_contigs = []

    for locus in locus_dict:
        max_len = 0
        for h, s in locus_dict[locus]:
            if len(s) > max_len:
                max_len = len(s)
                curr = (h, s)
        longest_contigs.append(curr)

    with open(outfile, 'w') as outfas:
        for contig in longest_contigs:
            if len(contig[1]) >= min_length:
                outfas.write(">" + contig[0] + "\n")
                outfas.write(contig[1] + "\n")
        for contig in contig_dict:
            if len(contig_dict[contig]) >= min_length:
                outfas.write(">" + contig + "\n")
                outfas.write(contig_dict[contig] + "\n")

    return longest_contigs
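All of these examples iterate FastaReader(path) from mungo.fasta and unpack (header, sequence) pairs, with the leading ">" already stripped from the header. If mungo is not available, a minimal stand-in generator with the same interface might look like the sketch below; this is only an approximation of the interface the examples rely on, not mungo's actual implementation.

def FastaReader(path):
    # Hypothetical stand-in for mungo.fasta.FastaReader: yield (header, sequence)
    # tuples, concatenating wrapped sequence lines and dropping the '>' prefix.
    header, chunks = None, []
    with open(path) as handle:
        for line in handle:
            line = line.rstrip("\n")
            if line.startswith(">"):
                if header is not None:
                    yield header, "".join(chunks)
                header, chunks = line[1:], []
            elif line:
                chunks.append(line)
        if header is not None:
            yield header, "".join(chunks)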
def split_genome(fastafile, outputprefix, n_splits, n_reps):
    seqs = []
    total_length = 0
    for h, s in FastaReader(fastafile):
        seqs.append((h, s))

    if len(seqs) > 1:
        raise ValueError(
            "Multiple contigs in fasta file, which isn't currently supported!")
    h = ""
    s = seqs[0][1]

    print "Genome length:", len(s)

    for r in range(n_reps):
        outputfile = outputprefix + "_rep_" + str(r) + ".fasta"
        # generate random split locations
        cuts = np.random.choice(len(s), n_splits)
        cuts = sorted(np.append(cuts, len(s)))
        print "cuts:", cuts
        prev = 0
        with open(outputfile, 'w') as outfile:
            for i, cut in enumerate(cuts):
                outfile.write(">split_" + str(i) + "\n")
                outfile.write(s[prev:cut] + "\n")
                prev = cut

    print "Final contig end position:", len(s)

    return
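split_genome assumes numpy is imported as np at module level. A quick usage sketch of the function above (the file names here are hypothetical), making three replicates with five random cut points each:

import numpy as np  # assumed module-level import in the original source

split_genome("genome.fasta", "splits/genome", n_splits=5, n_reps=3)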
Example #3
def remove_shorter_sequences(fasta_file, per_within_max, len_cutoff,
                             outputfile):

    locus_dict = {}

    #Load in fasta file
    for h,s in FastaReader(fasta_file):
        locus_name = h.split("_")[1]
        if locus_name not in locus_dict:
            locus_dict[locus_name]=Locus(h)

        locus_dict[locus_name].transcripts.append(Transcript(h, s))

        if locus_dict[locus_name].max_trans_length < len(s):
            locus_dict[locus_name].max_trans_length = len(s)

    #now output with cleaning
    with open(outputfile,'w') as outfile:
        for locus_name in locus_dict:
            for t in locus_dict[locus_name].transcripts:
                if ((len(t.sequence)/float(locus_dict[locus_name].max_trans_length))
                    < (1-per_within_max)): #too far away from the longest
                    continue
                if len(t.sequence) < len_cutoff:
                    continue
                outfile.write(">"+t.name+"\n")
                outfile.write(t.sequence+"\n")
Example #4
def get_rask_var(raskFasta, contig_file, fileName, outdir, verbose):

    #run blast against the rask VAR genes
    blast_out = run_blast(raskFasta, contig_file, outdir, verbose)

    rask_hits = set()
    with open(blast_out, 'r') as blastfile:
        for line in blastfile:
            rask_hits.add(line.split()[0])

    rask = outdir + fileName + "_rask.fa"
    non_rask = outdir + fileName + "_nonrask.fa"

    count_rask = 0
    count_non_rask = 0

    with open(rask, 'w') as raskout:
        with open(non_rask, 'w') as nonraskout:
            for h, s in FastaReader(contig_file):
                if h in rask_hits:
                    raskout.write(">" + h + "\n")
                    raskout.write(s + "\n")
                    count_rask += 1
                else:
                    nonraskout.write(">" + h + "\n")
                    nonraskout.write(s + "\n")
                    count_non_rask += 1

    if verbose:
        print count_rask, " annotated to rask DB..."
        print count_non_rask, " remaining"

    return rask, non_rask
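run_blast is imported from the author's third_party_runners module and is not shown on this page. The callers only assume it returns the path of a whitespace-delimited results file in BLAST outfmt 6 style, where column 0 is the query id, column 2 the percent identity and column 3 the alignment length. A rough, hypothetical stand-in using the BLAST+ command line tools (database and file naming invented for illustration) could be:

import os
from subprocess import check_call

def run_blast(reference_fasta, query_fasta, outdir, verbose=False):
    # Hypothetical sketch: build a nucleotide database from the reference and
    # run blastn with tabular (outfmt 6) output, returning the results path.
    ref_name = os.path.splitext(os.path.basename(reference_fasta))[0]
    query_name = os.path.splitext(os.path.basename(query_fasta))[0]
    db = os.path.join(outdir, ref_name + "_db")
    blast_out = os.path.join(outdir, query_name + "_vs_" + ref_name + "_blast.txt")
    makedb_cmd = "makeblastdb -dbtype nucl -in " + reference_fasta + " -out " + db
    blast_cmd = ("blastn -outfmt 6 -query " + query_fasta + " -db " + db +
                 " -out " + blast_out)
    if verbose:
        print makedb_cmd
        print blast_cmd
    check_call(makedb_cmd, shell=True)
    check_call(blast_cmd, shell=True)
    return blast_out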
Example #5
def merge_files(easy, hmmMatch, outputdir):

    out_file = (outputdir + os.path.splitext(os.path.basename(easy))[0] +
                "_finalTranslated.fa")

    with open(out_file, 'w') as outfile:
        for h, s in FastaReader(easy):
            outfile.write(">" + h + "\n")
            outfile.write(s + "\n")
        for h, s in FastaReader(hmmMatch):
            outfile.write(">" + h + "\n")
            outfile.write(s + "\n")

    print "Success!"

    return
def translate_6_frame(in_fasta, out_fasta):
    with open(out_fasta, 'w') as outfile:
        for h, s in FastaReader(in_fasta):
            for frame, seq in sixFrameTranslation(s).items():
                outfile.write(">" + h + " _frame_" + str(frame) + "\n")
                outfile.write(seq + "\n")
    return out_fasta
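sixFrameTranslation is another helper that is not defined on this page; the examples only rely on it returning a dict that maps each of the six reading frames to a protein string with stop codons written as '*'. A hedged sketch using Biopython, with frame numbering 1-6 assumed purely for illustration:

from Bio.Seq import Seq

def sixFrameTranslation(sequence):
    # Hypothetical sketch: translate the three forward and three reverse-strand
    # reading frames and return {frame: protein}; stops appear as '*'.
    forward = Seq(sequence)
    reverse = forward.reverse_complement()
    frames = {}
    for offset in range(3):
        frames[offset + 1] = str(forward[offset:].translate())
        frames[offset + 4] = str(reverse[offset:].translate())
    return frames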
Example #7
def filter_Locus_1(outputdir, folder_prefix, verbose=False):
    final_transcripts = []
    locus_dict = defaultdict(list)
    for transcript_file in glob.glob(outputdir + folder_prefix +
                                     '*/transcripts.fa'):
        #now need to extract all transcripts that are the only member of
        #their Locus
        # str.strip removes a set of characters, not a suffix; slice off the
        # file name instead and keep the folder's last two characters (the k-mer size)
        name = transcript_file[:-len("/transcripts.fa")][-2:]
        print name
        print transcript_file
        for h, s in FastaReader(transcript_file):
            locus = h.split("_")[1] + "_" + name
            locus_dict[locus].append((h + "_K" + name, s))
    #Now write out all loci with a single transcript (plus everything from the
    #k=61 run) to the outfile
    out_fileA = outputdir + "filter_locus1_transcripts_keep61.fa"
    with open(out_fileA, 'w') as outfile:
        for locus in locus_dict:
            if (locus.split("_")[-1] == '61') or (len(locus_dict[locus]) == 1):
                for t in locus_dict[locus]:
                    outfile.write(">" + t[0] + "\n")
                    outfile.write(t[1] + "\n")

    # out_fileB = outputdir + "filter_locus1_transcripts.fa"
    # with open(out_fileB, 'w') as outfile:
    #     for locus in locus_dict:
    #         if len(locus_dict[locus]) == 1:
    #             for t in locus_dict[locus]:
    #                 outfile.write(">" + t[0] + "\n")
    #                 outfile.write(t[1] + "\n")

    return out_fileA
Example #8
def filter_with_HMMER(orfFile, hmmfile, hmmThresh, outputdir):
    outname = os.path.splitext(os.path.basename(orfFile))[0]

    search_file = searchhmmer(orfFile, hmmfile, 9.97, outputdir, outname, True)

    output_file = (outputdir + os.path.splitext(os.path.basename(orfFile))[0] +
                   "_matchedHMMER.fa")

    # target name        accession   tlen query name           accession   qlen   E-value  score  bias   #  of  c-Evalue  i-Evalue  score  bias  from    to  from    to  from    to  acc description of target
    head_hits = defaultdict(set)
    with open(search_file, 'r') as searchfile:
        for line in searchfile:
            if line[0] == '#':
                continue
            line = line.strip().split()
            sequence = line[0]
            hmmHit = line[3]

            head_hits[sequence].add(hmmHit)

    num_orfs = 0
    num_keep = 0
    with open(output_file, 'w') as outfile:
        for h, s in FastaReader(orfFile):
            num_orfs += 1
            if len(head_hits[h]) >= hmmThresh:
                outfile.write(">" + h + "\n")
                outfile.write(s + "\n")
                num_keep += 1

    print str(num_keep) + " orfs kept out of " + str(num_orfs)

    return output_file
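searchhmmer is not shown either; judging from the --domtblout parsing above (target name in column 0, query/HMM name in column 3) and from the filterWithHMMER example below, it presumably wraps hmmsearch and returns the path of the domain table. The 9.97 passed above is assumed here to be a per-domain score threshold. A speculative sketch:

from subprocess import check_call

def searchhmmer(orfFile, hmmfile, domThresh, outputdir, outname, verbose):
    # Hypothetical sketch: run hmmsearch with a per-domain score cutoff and
    # return the --domtblout table parsed by filter_with_HMMER above.
    search_file = outputdir + outname + "_hmmsearch.txt"
    hmmer_cmd = ("hmmsearch -o /dev/null --domT " + str(domThresh) +
                 " --domtblout " + search_file + " " + hmmfile + " " + orfFile)
    if verbose:
        print "running...", hmmer_cmd
    check_call(hmmer_cmd, shell=True)
    return search_file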
Example #9
def filterWithHMMER(inputfile, prefix, cpu, verbose):

    hmmer_cmd = (HMMERSEARCH + " -o /dev/null" + " --domT 80" +
                 " --domtblout " + "hmmerDBLalphaSearch.txt" + " --cpu " +
                 str(cpu) + " " + DBLA_HMM + " " + inputfile)

    if verbose:
        print "running... ", hmmer_cmd

    check_call(hmmer_cmd, shell=True)

    #Now run through and get DBLalpha seqs
    keep = set()
    with open("hmmerDBLalphaSearch.txt", 'rU') as infile:
        for line in infile:
            if line[0] == "#": continue
            keep.add(line.split()[0])

    with open(prefix + "_DBLa_cleaned.fasta", 'w') as dblfile:
        with open(prefix + "_NOT_dblalpha.fasta", 'w') as contamfile:
            for h, s in FastaReader(inputfile):
                if h in keep:
                    dblfile.write(">" + h + "\n" + s + "\n")
                else:
                    contamfile.write(">" + h + "\n" + s + "\n")
Example #10
def trim_contigs(length, contig_file, outdir):
    out_file = outdir + "trim_transcripts.fa"
    with open(out_file, 'w') as outfile:
        for h, s in FastaReader(contig_file):
            if len(s) < length:
                continue
            outfile.write(">" + h + "\n")
            outfile.write(s + "\n")

    return out_file
Example #11
def reNameContigs(contig_file, fileName, outputdir):
    renamed = outputdir + fileName + "renamed.fa"
    with open(renamed, 'w') as outfile:
        for h, s in FastaReader(contig_file):
            h = h.split()[0]
            if len(h.split("_")) > 4:  #oases transcript
                h = h.split("_")
                h = "_".join([h[0], h[1], h[3]])
            outfile.write(">" + h + "\n")
            outfile.write(s + "\n")
    return renamed
def convert(gfffile, fastafile, outputfile):

    with open(outputfile, 'w') as outfile:
        outfile.write("##gff-version 3\n")
        for h, s in FastaReader(fastafile):
            h = h.split()[0]
            outfile.write(" ".join(["##sequence-region", h, "1",
                                    str(len(s))]) + "\n")

        with open(gfffile, 'rU') as infile:
            for line in infile:
                if line[0] != "#":
                    outfile.write(line)

        outfile.write("##FASTA\n")

        for h, s in FastaReader(fastafile):
            h = h.split()[0]
            outfile.write(">" + h + "\n" + s + "\n")

    return
Example #13
def get_contaminants(fasta_ref_files, contig_file, fileName, percent_overlap,
                     outdir, verbose):
    #first get list of contigs
    contigs = {}
    for h, s in FastaReader(contig_file):
        contigs[h] = s

    if verbose:
        print("Number of contigs before contaminant filtering: ",
              len(contigs.keys()))

    #now run blast against the reference files which we want not to be
    #present in the data i.e. human
    blast_files = []
    for reference in fasta_ref_files:
        blast_files.append(run_blast(reference, contig_file, outdir, verbose))

    #now iterate through the blast results files, removing contigs that have
    #too high a proportion of hits
    bad_contigs = set()
    for blast_file in blast_files:
        blast_name = os.path.splitext(os.path.basename(blast_file))[0]
        with open(blast_file, 'r') as bfile:
            for line in bfile:
                tokens = line.strip().split()
                name = tokens[0]
                overlap = int(tokens[3]) / float(len(contigs[name]))
                if overlap > percent_overlap:
                    #we don't want this contig
                    print "removing", name, "overlapped", blast_name
                    bad_contigs.add(name)

    #now write out a fasta file of contaminant sequences
    contaminant_file = outdir + fileName + "contaminants.fa"
    with open(contaminant_file, 'w') as outfile:
        for contig in bad_contigs:
            outfile.write(">" + contig + "\n")
            outfile.write(contigs[contig] + "\n")

    #now write contigs without contaminants to a file
    non_contaminant_file = outdir + fileName + "Non_contaminants.fa"
    with open(non_contaminant_file, 'w') as outfile:
        for contig in contigs:
            if contig not in bad_contigs:
                outfile.write(">" + contig + "\n")
                outfile.write(contigs[contig] + "\n")

    if verbose:
        print("Number of contigs after filtering: ",
              len(contigs.keys()) - len(bad_contigs))

    return non_contaminant_file, contaminant_file
Example #14
def combineReadFiles(outputfile, verbose):

    if verbose:
        print "combining sample files..."

    #combine read files appending sample name to read headers
    with open(outputfile, 'w') as outfile:
        for f in glob.glob("*_lowSupportFiltered.fasta"):
            for h, s in FastaReader(f):
                outfile.write(">" + h + "sample=" +
                              f.split("_demultiplex")[0] + "\n")
                outfile.write(s + "\n")

    return
Example #15
def main_new(fastafile, bkp):
    # Decide which of the three aligned sequences is the likely recombinant by
    # comparing pairwise distances on either side of the breakpoint (bkp).
    distance_name = ["ab", "ac", "bc"]
    seq_name = []
    for h, s in FastaReader(fastafile):
        seq_name.append(h)
    aln = AlignIO.read(open(fastafile), 'fasta')
    calculator = DistanceCalculator('blosum62')
    segment_1 = calculator.get_distance(aln[:, :bkp])
    segment_2 = calculator.get_distance(aln[:, bkp:])
    # pairwise distances in the order ab, ac, bc for each segment
    distance = [segment_1[seq_name[1]][0], segment_1[seq_name[2]][0],
                segment_1[seq_name[2]][1], segment_2[seq_name[1]][0],
                segment_2[seq_name[2]][0], segment_2[seq_name[2]][1]]
    # change in each pairwise distance across the breakpoint, in order ab, ac, bc
    compare_distance = [abs(distance[0] - distance[3]),
                        abs(distance[1] - distance[4]),
                        abs(distance[2] - distance[5])]
    # the pair that changes least across the breakpoint excludes the
    # recombinant; report the remaining sequence
    temp2 = distance_name[compare_distance.index(min(compare_distance))]
    string = "abc"
    string = string.replace(temp2[0], "")
    string = string.replace(temp2[1], "")
    rec = seq_name["abc".index(string)]
    return rec
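main_new depends on Biopython objects imported elsewhere in its source module; the imports it presumably needs are:

from Bio import AlignIO
from Bio.Phylo.TreeConstruction import DistanceCalculator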
Example #16
def filter_length(contig_file, length_filter, fileName, outdir, verbose):
    length_file = outdir + fileName + "lenFilt.fa"

    short_count = 0
    with open(length_file, 'w') as outfile:
        for h, s in FastaReader(contig_file):
            if len(s) < length_filter:
                short_count += 1
            else:
                outfile.write(">" + h + "\n")
                outfile.write(s + "\n")

    if verbose:
        print short_count, " contigs removed as too short..."

    return length_file
Example #17
def annotate_w_ntDB(contig_file, fileName, outdir, verbose):

    blastOut = outdir + fileName + "nonRaskBlast.txt"

    #first run a special blast using the nt database
    blast_cmd = (
        "blastn " + "-evalue 10 " +
        """-outfmt "6 qseqid sseqid  stitle length pident qstart qend sstart send evalue" """
        # + "-num_alignments " + str(num_hits) + " "
        + "-num_threads 10 -max_target_seqs 3 " + "-db " + BLAST_NT_DB + " " +
        "-query " + contig_file + " " + "-out " + blastOut)
    if verbose:
        print blast_cmd

    check_call(blast_cmd, shell=True)

    #now retrieve annotation information
    contigs = defaultdict(str)
    contigs_perID = defaultdict(float)
    with open(blastOut, 'r') as blastfile:
        for line in blastfile:
            line = line.strip().split("\t")
            contigs[line[0]] = (contigs[line[0]] + " [" + line[2] +
                                "_alignLen_" + line[3] + "_perID_" + line[4] +
                                "] ")
            contigs_perID[line[0]] = max(contigs_perID[line[0]],
                                         float(line[4]))

    #now re-write the fasta file with the annotations in the headers
    annotated = outdir + fileName + "nonRask_annotated.fa"
    unknown_blastOut = outdir + fileName + "ForManualInspection.fa"

    with open(annotated, 'w') as outfileKnown:
        with open(unknown_blastOut, 'w') as outfileUnknown:
            for h, s in FastaReader(contig_file):
                if h in contigs and contigs_perID[h] > 97:
                    outfileKnown.write(">" + h + " " + contigs[h] + "\n")
                    outfileKnown.write(s + "\n")
                    continue
                if h in contigs:
                    outfileUnknown.write(">" + h + " " + contigs[h] + "\n")
                else:
                    outfileUnknown.write(">" + h + " none\n")
                outfileUnknown.write(s + "\n")

    return annotated
Example #18
def pull_out_long_ORFs(bad_file, len_cutoff, outputdir):

    out_file = (outputdir + os.path.splitext(os.path.basename(bad_file))[0] +
                "_TranslongORFS.fa")

    num_seqs = 0
    num_orfs = 0
    with open(out_file, 'w') as outfile:
        for h, s in FastaReader(bad_file):
            num_seqs += 1
            translation = sixFrameTranslation(s)
            orfs = get_long_ORFS(translation, len_cutoff)
            for o in orfs:
                num_orfs += 1
                outfile.write(">" + h + o[0] + "\n")
                outfile.write(o[1] + "\n")

    print str(num_seqs) + " sequences translated into " + str(
        num_orfs) + " ORFs"

    return out_file
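get_long_ORFS is also an off-page helper. The caller only assumes it returns (label, peptide) pairs, where the label is appended to the read header. A hypothetical version that keeps every stop-free stretch of at least len_cutoff residues from the six-frame translations:

def get_long_ORFS(translation, len_cutoff):
    # Hypothetical sketch: scan each translated frame for runs between stop
    # codons and keep those at least len_cutoff residues long.
    orfs = []
    for frame, protein in translation.items():
        for i, fragment in enumerate(protein.split("*")):
            if len(fragment) >= len_cutoff:
                orfs.append(("_frame_" + str(frame) + "_orf_" + str(i), fragment))
    return orfs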
Example #19
def filter_ref_with_blast(fasta_ref_files, contig_file, percent_overlap,
                          outfile, outdir):

    #first get list of contigs
    contigs = {}
    for h, s in FastaReader(contig_file):
        contigs[h] = s

    print "Number of contigs before filtering: ", len(contigs.keys())

    #now run blast against the reference files which we want not to be
    #present in the data i.e. human
    blast_files = []
    for reference in fasta_ref_files:
        blast_files.append(run_blast(reference, contig_file, outdir, True))

    #now iterate through the blast results files, removing contigs that have
    #too high a proportion of hits
    bad_contigs = set()
    for blast_file in blast_files:
        blast_name = os.path.splitext(os.path.basename(blast_file))[0]
        with open(blast_file, 'r') as bfile:
            for line in bfile:
                tokens = line.strip().split()
                name = tokens[0]
                overlap = int(tokens[3]) / float(len(contigs[name]))
                if overlap > percent_overlap:
                    #we don't want this contig
                    print "removing", name, "overlapped", blast_name
                    bad_contigs.add(name)
    for name in bad_contigs:
        del contigs[name]

    #now write resulting contigs to a file
    with open(outfile, 'w') as outfas:
        for contig in contigs:
            outfas.write(">" + contig + "\n")
            outfas.write(contigs[contig] + "\n")

    print "Number of contigs after filtering: ", len(contigs.keys())
Example #20
def split_easy_from_hard(inputfile, outputdir):
    seqCount = 0
    badSeqs = 0
    bad_lengths = []

    output_file = (outputdir +
                   os.path.splitext(os.path.basename(inputfile))[0] +
                   "_translatedL2stops.fa")

    with open(output_file + "_BadSeqs", 'w') as badfile:
        with open(output_file, 'w') as outfile:

            for h, s in FastaReader(inputfile):
                stops = 9999
                translation = sixFrameTranslation(s)

                for frame in translation:
                    st = translation[frame].count('*')
                    if st < stops:
                        best = frame
                        stops = st

                if stops <= 2:
                    outfile.write(">" + h + " frame_" + str(best) + "\n")
                    outfile.write(translation[best] + "\n")
                else:
                    badSeqs += 1
                    bad_lengths.append(len(s))
                    badfile.write(">" + h + "\n")
                    badfile.write(s + "\n")

                seqCount += 1
    print(
        str((100.0 * badSeqs) / seqCount) + "percent or " + str(badSeqs) +
        " out of " + str(seqCount) + " were not translated.")

    return output_file, output_file + "_BadSeqs"
        is_fasta = True
    else:
        is_fasta = False

    if uproc:
        uprocOut = run_uproc(outdir+prefix+"_uprocList.csv", read1, read2)

        hmmerIn = process_uproc_results(uprocOut, outdir+prefix+"_UprocReads.fa"
            , read1, read2)
        # hmmerIn = outdir+prefix+"_UprocReads.fa"
    else:
        hmmerIn = outdir+prefix+"_NoFilterList.fa"
        with open(hmmerIn, 'w') as outfile:
            if not is_fasta:
                if read2:
                    for h,s in FastaReader(read2):
                        outfile.write(">"+h+"\n"+s+"\n")
                for h,s in FastaReader(read1):
                    outfile.write(">"+h+"\n"+s+"\n")
            else:
                if read2:
                    for h,s,q in FastqReader(read2):
                        outfile.write(">"+h+"\n"+s+"\n")
                for h,s,q in FastqReader(read1):
                    outfile.write(">"+h+"\n"+s+"\n")


    uproc_reads_6frame = translate_6_frame(hmmerIn
        , outdir+prefix+"_Uproc_6frame.fa")
    hmm_out = allocate_w_hmmer(uproc_reads_6frame
        , outdir+prefix+"_nhmmOut.txt", evalue)
Example #22

input=dir+"complement_chunks"
output=dir+"results/result_all.csv"
headers=["chunk","target","db1","db2","rec","sv"]
with open(output, 'a+') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(headers)
for fasta_file in glob.glob(input+"/*.fasta"):
    bkp=int(fasta_file.split(".fasta")[0].split("_")[-1])
    chunk=fasta_file.split("/chunk")[1].split("_")[0]
    initial= main_new(fasta_file,bkp)
    #print initial
    mapping_dict = {}
    seq_name = []
    pvalue = 0
    permutation = 100
    for h, s in FastaReader(fasta_file):
        mapping_dict[h] = s
        seq_name.append(h)
        full_alignment_length = len(s)
    for m in range(permutation):
        # bootstrap the alignment columns to the left of the breakpoint
        index_1 = np.random.choice(bkp, bkp, replace=True)
        shuffled_sequences_1 = []
        for n in [mapping_dict[seq_name[0]][0:bkp],
                  mapping_dict[seq_name[1]][0:bkp],
                  mapping_dict[seq_name[2]][0:bkp]]:
            temp_list_1 = ""
            for j in index_1:
                temp_list_1 = temp_list_1 + n[j]
            shuffled_sequences_1.append(temp_list_1)
        # bootstrap the columns to the right of the breakpoint
        s2_length = full_alignment_length - bkp
        index_2 = np.random.choice(s2_length, s2_length, replace=True)
        shuffled_sequences_2 = []
        for n in [mapping_dict[seq_name[0]][bkp:],
                  mapping_dict[seq_name[1]][bkp:],
                  mapping_dict[seq_name[2]][bkp:]]:
            temp_list_2 = ""
            for j in index_2:
                temp_list_2 = temp_list_2 + n[j]
            shuffled_sequences_2.append(temp_list_2)
Example #23
import sys, os
from third_party_runners import run_blast
from mungo.fasta import FastaReader

contig_file = sys.argv[1]
blastdb_file = sys.argv[2]
outdir = sys.argv[3]

name = os.path.splitext(os.path.basename(contig_file))[0]

blast_out = run_blast(blastdb_file, contig_file, outdir, True)

rask_hits = set()
with open(blast_out, 'r') as blastfile:
    for line in blastfile:
        rask_hits.add(line.split()[0])

with open(outdir + name + "_rask.fa", 'w') as raskout:
    with open(outdir + name + "_nonrask.fa", 'w') as nonraskout:
        for h, s in FastaReader(contig_file):
            if h in rask_hits:
                raskout.write(">" + h + "\n")
                raskout.write(s + "\n")
            else:
                nonraskout.write(">" + h + "\n")
                nonraskout.write(s + "\n")
            else:
                with open(
                        dir + "temp/chunk_" + str(i) +
                        "/original_chunk_db.txt", "rU") as db_infile:
                    for n, db_line in enumerate(db_infile.readlines()):
                        line1 = db_line.strip().split()
                        h1 = line1[0]
                        s1 = line1[1]
                        if n == k:
                            outfile.write(">" + h + "\n" +
                                          s[bkp[k - 1]:bkp[k]] + "\n" + ">" +
                                          h1 + "\n" + s1 + "\n")

### Step 3: add the mafft alignment step (use replace, not split).
mapping_dict_seq_identifier = {}
for h, s in FastaReader(dir + "data/simulated_seqs_recombined.fasta"):
    mapping_dict_seq_identifier[h] = s
for i in range(Chunk_count):
    db_line_index = range(startline[i] + 2, endline[i] + 1)
    line_index = [startline[i]] + range(startline[i] + 2, endline[i] + 1)
    for k in range(mosaic_output_dbcount[i] - 1):
        with open(input, 'rU') as infile:
            for j, line in enumerate(infile.readlines()):
                if j == db_line_index[k]:
                    line1 = line.strip().split()
                    h1 = line1[0]
                    s1 = line1[1]
                    if len(s1) >= 10:
                        original_seq = mapping_dict_seq_identifier[h1]
                        s = line1[1].replace("-", "")
                        with open(
Example #25
def countReads(fastafile):
    count = 0
    for h, s in FastaReader(fastafile):
        count += 1
    return count
Example #26
import sys

from third_party_runners import run_blast
from mungo.fasta import FastaReader

OVERLAP = 0.7
IDENTITY = 95

sequences = sys.argv[1]
outdir = sys.argv[2]


#first run all-vs-all blast
bfile = run_blast(sequences, sequences, outdir, True)

#now get the lengths of all the contigs
contig_len = {}
for h,s in FastaReader(sequences):
    contig_len[h]=len(s)

#now iterate through search finding redundant contigs
redundant_contigs = set()
with open(bfile, 'r') as blastsearch:
    for line in blastsearch:
        tokens = line.strip().split()
        if tokens[0]==tokens[1]: #matching itself
            continue
        if float(tokens[2]) >= IDENTITY:
            # print "identity ",tokens[2]
            if ((float(tokens[3])/contig_len[tokens[0]] >= OVERLAP) or 
                (float(tokens[3])/contig_len[tokens[1]] >= OVERLAP)):
                # print "overlap", max(float(tokens[3])/contig_len[tokens[0]],float(tokens[3])/contig_len[tokens[1]])
                #we want to remove the shorter contig
Example #27
def assemble_paired_reads_soapDeNovoTrans(fasta_single,
                                          fasta_paired,
                                          outputdir,
                                          ins_length=False,
                                          verbose=False):

    #first move in temp directory so the output is kept nicely
    curr_dir = os.getcwd()
    os.chdir(outputdir)

    scriptPath = getScriptPath()

    if fasta_paired is None:
        config_str = ("""#maximal read length
max_rd_len=250
[LIB]
#maximal read length in this lib
rd_len_cutof=250
#average insert size
avg_ins=0
#if sequence needs to be reversed
reverse_seq=0
#in which part(s) the reads are used
asm_flags=3
#minimum aligned length to contigs for a reliable read location (at least 32 for short insert size)
map_len=32
#fasta file for single reads
q=""" + fasta_single)
    elif (fasta_single == None) or (os.stat(fasta_single).st_size
                                    == 0):  #check if single assembly
        script = "python " + scriptPath + "/third-party/khmer/scripts/split-paired-reads.py"
        #first split the interleaved file so SOAPdenovo-Trans can use the pairs
        split_cmd = (script + " " + fasta_paired)

        if verbose:
            print split_cmd
        check_call(split_cmd, shell=True)

        config_str = ("""#maximal read length
max_rd_len=250
[LIB]
#maximal read length in this lib
rd_len_cutof=250
#average insert size
avg_ins=0
#if sequence needs to be reversed
reverse_seq=0
#in which part(s) the reads are used
asm_flags=3
#minimum aligned length to contigs for a reliable read location (at least 32 for short insert size)
map_len=32
#fasta file for single reads
q1=""" + fasta_paired + ".1\n" + "q2=" + fasta_paired + ".2")
    else:
        script = "python " + scriptPath + "/third-party/khmer/scripts/split-paired-reads.py"
        #first split the interleaved file so SOAPdenovo-Trans can use the pairs
        split_cmd = (script + " " + fasta_paired)

        if verbose:
            print split_cmd
        check_call(split_cmd, shell=True)

        config_str = ("""#maximal read length
max_rd_len=250
[LIB]
#maximal read length in this lib
rd_len_cutof=250
#average insert size
avg_ins=0
#if sequence needs to be reversed
reverse_seq=0
#in which part(s) the reads are used
asm_flags=3
#minimum aligned length to contigs for a reliable read location (at least 32 for short insert size)
map_len=32
#fasta file for single reads
q1=""" + fasta_paired + ".1\n" + "q2=" + fasta_paired + ".2\n" + "q=" +
                      fasta_single)

    scriptPath = getScriptPath()
    script = (
        scriptPath +
        "/third-party/SOAPdenovo-Trans-bin-v1.03/SOAPdenovo-Trans-127mer")

    for K in ["21", "31", "41", "51", "61"]:
        config_file = outputdir + "soapConfigK" + K
        outputGraph = outputdir + "soapGraphK" + K

        soap_cmd = (
            script + " all" + " -s " + config_file + " -o " + outputGraph +
            " -K " + K +
            " -L 50"  #not actually important as we only use the contigs
        )

        with open(config_file, 'w') as outfile:
            outfile.write(config_str)

        if verbose:
            print soap_cmd

        check_call(soap_cmd, shell=True)

    #now to combine the contig files
    outfile = outputdir + "combined_soapDeNovo.fa"

    #now rename transcripts to include which Kmer run they came from and combine
    # into one file
    with open(outfile, 'w') as output:
        for transcript_file in glob.glob("*.contig"):
            fname = os.path.splitext(os.path.basename(transcript_file))[0]
            for h, s in FastaReader(transcript_file):
                h = h.strip().split()
                output.write(
                    (">" + h[0] + "_" + fname + " " + " ".join(h[1:]) + "\n"))
                output.write(s + "\n")

    #return to previous directory
    os.chdir(curr_dir)

    return outfile
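getScriptPath, used above to locate the bundled third-party tools, is not reproduced here; a plausible implementation simply returns the directory containing the current source file:

import os

def getScriptPath():
    # Hypothetical sketch: absolute directory of the current source file.
    return os.path.dirname(os.path.realpath(__file__))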
Example #28
def calculateSummaryStatistics(pairedMIDs, outputfile, total_reads,
                               total_reads_before_contaminant_filtering,
                               total_reads_after_contaminant_filtering,
                               blast_isolate_counts, chimeric_filt,
                               cleaned_read_file, verbose):

    ##Calculate flexbar summary statistics
    flexBarStats = readFlexBarLog("flexbarRun1.log")
    logfiles = glob.glob("*flexbarRun2.log")
    for f in logfiles:
        tempLog = readFlexBarLog(f)
        flexBarStats["processed_reads"] += tempLog["processed_reads"]
        flexBarStats["skip_uncalled_bases"] += tempLog["skip_uncalled_bases"]
        flexBarStats["skip_unassigned_reads"] += tempLog[
            "skip_unassigned_reads"]
        flexBarStats["skip_short_reads"] += tempLog["skip_short_reads"]
        flexBarStats["skip_single_reads"] += tempLog["skip_single_reads"]
        flexBarStats["discarded_reads"] += tempLog["discarded_reads"]

    ##Calculate pear merging statistics
    #Now merge files based on MID combinations
    pearStats = {}
    for mid in pairedMIDs:
        filenames = glob.glob("*B_" + mid[0] + "_*B_" + mid[1] +
                              "_*_pearOut.log")
        if mid[0] != mid[1]:
            #add other direction
            filenames += glob.glob("*B_" + mid[1] + "_*B_" + mid[0] +
                                   "_*_pearOut.log")
        sample = pairedMIDs[mid]
        if len(filenames) > 0:
            pearStats[sample] = readPearLog(filenames[0])
        else:
            pearStats[sample] = {}
            pearStats[sample]["total"] = 0
            pearStats[sample]["assembled"] = 0
            pearStats[sample]["discarded"] = 0
            pearStats[sample]["not_assembled"] = 0
            print "Did not find", sample, mid
        for f in filenames[1:]:
            tempPear = readPearLog(f)
            pearStats[sample]["total"] += tempPear["total"]
            pearStats[sample]["assembled"] += tempPear["assembled"]
            pearStats[sample]["discarded"] += tempPear["discarded"]
            pearStats[sample]["not_assembled"] += tempPear["not_assembled"]

    ##Calculate support filtering statistics
    #Count number of reads before support filtering
    initialCounts = {}
    for f in glob.glob("*_demultiplexTrimMerged.fasta"):
        sample = f.split("_demultiplexTrimMerged")[0]
        initialCounts[sample] = countReads(f)

    #Count number of chimeric reads
    chimericCounts = Counter()
    if chimeric_filt:
        for f in glob.glob("*_ncdenovo.fasta"):
            sample = f.split("_demultiplexTrimMerged")[0]
            chimericCounts[sample] = countReads(f)
    else:
        for sample in initialCounts:
            chimericCounts[sample] = 0

    #Count number of centroids
    centroidCounts = Counter()
    for f in glob.glob("*_demultiplexTrimMerged_centroids.fasta"):
        sample = f.split("_demultiplexTrimMerged")[0]
        centroidCounts[sample] = countReads(f)

    #Count number of centroids with sufficient support
    supportCentroids = Counter()
    for f in glob.glob("*_demultiplexTrimMerged_lowSupportFiltered.fasta"):
        sample = f.split("_demultiplexTrimMerged")[0]
        supportCentroids[sample] = countReads(f)

    #Count number of reads remaining after contaminant filtering
    contamFilteredCounts = Counter()
    for h, s in FastaReader(cleaned_read_file):
        sample = h.strip().split("sample=")[1]
        contamFilteredCounts[sample] += 1

    ##write to outputfile
    with open(outputfile, 'w') as outfile:
        outfile.write("# Total paired end reads: " + str(total_reads) + "\n")
        outfile.write("# Total reads prior to contaminant filtering: " +
                      str(total_reads_before_contaminant_filtering) + "\n")
        outfile.write("# Total reads after contaminant filtering: " +
                      str(total_reads_after_contaminant_filtering) + "\n")

        outfile.write("#######  FlexBar Summary statistics  #######\n")
        outfile.write("# Total reads processed: " +
                      str(flexBarStats["processed_reads"]) + "\n")
        outfile.write("# Reads skipped due to uncalled bases: " +
                      str(flexBarStats["skip_uncalled_bases"]) + "\n")
        outfile.write("# Unassigned reads: " +
                      str(flexBarStats["skip_unassigned_reads"]) + "\n")
        outfile.write("# Short reads skipped: " +
                      str(flexBarStats["skip_short_reads"]) + "\n")
        outfile.write("# Single reads skipped: " +
                      str(flexBarStats["skip_single_reads"]) + "\n")
        outfile.write("# Total reads discarded: " +
                      str(flexBarStats["discarded_reads"]) + "\n")
        p = 100.0 * (1 - flexBarStats["discarded_reads"] /
                     float(flexBarStats["processed_reads"]))
        outfile.write("# Proportion of reads kept: " + str(p) + "\n")

        outfile.write("#######  Sample specific summary statistics  #######\n")
        #now write out sample specific statistics
        outfile.write(",".join([
            "Sample", "PreMerge", "Merged", "Discarded", "Not Assembled",
            "Filtered", "Chimeric", "Centroids", "Centroids with support",
            "After contaminant filtering", "3D7", "DD2", "HB3"
        ]) + "\n")

        for sample in pearStats:
            outfile.write(",".join([
                sample,
                str(pearStats[sample]["total"]),
                str(pearStats[sample]["assembled"]),
                str(pearStats[sample]["discarded"]),
                str(pearStats[sample]["not_assembled"]),
                str(pearStats[sample]["total"] - initalCounts[sample]),
                str(chimericCounts[sample]),
                str(centroidCounts[sample]),
                str(supportCentroids[sample]),
                str(contamFilteredCounts[sample]),
                str(blast_isolate_counts[sample]["3D7"]),
                str(blast_isolate_counts[sample]["DD2"]),
                str(blast_isolate_counts[sample]["HB3"])
            ]) + "\n")

        for sample in pearStats:
            if pearStats[sample]["total"] == 0:
                outfile.write("WARNING: Sample " + sample +
                              " resulted in 0 reads!\n")

    return
Example #29
#######################################################################

from mungo.fasta import FastaReader
import sys, os
import random
input_par1 = int(sys.argv[1])
input_par2 = int(sys.argv[2])
input_par3 = int(sys.argv[3])
input_par4 = int(sys.argv[4])
input_fasta = sys.argv[5]

seqs = {}
count = 0
seq_length = 0
temp_length = []
for h, s in FastaReader(input_fasta):
    seqs[h] = s
    count += 1
    temp_length.append(len(s))

seq_length = min(temp_length)  ## length should always be the same without considering indel events

if input_par1 == 0:  ## e.g. 25 single-breakpoint recombinant seqs and 25 non-recombinants, so input_par4 == 50
    need_seqs_num = 2 * input_par2 + input_par4 - input_par2
    seq_index = random.sample(range(need_seqs_num), k=2 * input_par2)
    with open(input_fasta[:-20] + "simulated_seqs_recombined.fasta",
              'w') as outfile:
        for count in range(input_par2):
            left_source = seq_index[2 * count]
Example #30
#!/usr/bin/env python

"""
calcATContent.py

Author: Tony Papenfuss
Date: Wed Jul  8 10:18:32 EST 2009

"""

import os, sys
from mungo.fasta import FastaReader


iFilename = sys.argv[1]

nAT = 0
total = 0
for h,s in FastaReader(iFilename):
    nAT += s.count("A") + s.count("T")
    total += len(s)
print "AT content %0.2f" % (100.0*nAT/total)