Esempio n. 1
0
import os
import subprocess
import sys

from runutils import read_run_details

# Render one quality-score image per run listed in the run-details file
# named by the first command-line argument.

# Maps the run's 'Tech' value to the file-type argument handed to qual.py.
FILETYPE_BY_TECH = {
    "454": "sff",
    "MiSeq": "fastq-illumina",
    "Ion Torrent": "fastq",
}

runs = read_run_details(sys.argv[1])
for run in runs:
    print(run['RunID'])

    filetype = FILETYPE_BY_TECH.get(run['Tech'])
    if filetype is None:
        print("unsupported format")
        raise SystemExit

    title = "%s - %s - Quality Scores" % (run['RunID'], run['Tech'])

    # Pass the arguments as a list and redirect stdout from Python instead of
    # building a shell string: run IDs, file names or titles containing
    # spaces, quotes or shell metacharacters are then handed to qual.py
    # verbatim rather than being re-interpreted by a shell.
    # The exit status of qual.py is ignored, as it was with os.system.
    with open("images/%s_qual.png" % run['RunID'], "wb") as image:
        subprocess.call(
            ["python", "qual.py", "reads/%s" % run['Filename'], filetype, title],
            stdout=image)
import pysam
import sys
from runutils import read_run_details
from Bio import SeqIO

# Index the FASTA records of the file named by the second command-line
# argument by record id; a later record with a repeated id replaces the
# earlier one.
reference = {}
for rec in SeqIO.parse(sys.argv[2], "fasta"):
    reference[rec.id] = rec

def has_masked(s):
    """Return how many characters of *s* are lowercase.

    The count is 0 (falsy) when *s* contains no lowercase characters, so the
    result can be used directly as a condition.
    """
    return sum(1 for ch in s if ch.islower())

# Reads whose mapq is below this threshold are counted as unmapped.
MINIMUM_MAPPING_QUALITY = 1

# Tab-separated column header for the per-read rows printed below.
print "sample\tref\trid\tmapped\tmapq\tinsertions\tl_insertions\tdeletions\tl_deletions\trlen"

# One entry per sample from the run-details file named by the first
# command-line argument; the 'Path', 'Description' and 'Reference' keys of
# each entry are used below.
samples = read_run_details(sys.argv[1])
for sample in samples:
	# Per-sample tallies of reads accepted / rejected by the filter below.
	mapped = 0
	unmapped = 0

	samfile = pysam.Samfile(sample['Path'], "rb")
	# NOTE(review): `id` is never advanced in the code visible here, so every
	# row printed below carries rid 1; the counters are also never reported.
	# This fragment looks cut off -- confirm against the full script.
	id = 1
	for read in samfile:
		# A read is treated as unmapped when it is flagged unmapped, when its
		# mapping quality is below the threshold, or when the reference slice
		# [read.pos, read.pos + read.alen) of the sequence it is aligned to
		# contains lowercase characters (see has_masked). The `or` chain
		# short-circuits, so the reference lookup is skipped for reads that
		# already fail one of the earlier tests.
		if read.is_unmapped or \
		read.mapq < MINIMUM_MAPPING_QUALITY or \
		has_masked(str(reference[samfile.getrname(read.tid)][read.pos : read.pos + read.alen].seq)):
			unmapped += 1

			# Rejected reads get a row with zeros in the six columns from
			# 'mapped' to 'l_deletions', and read.qlen in the 'rlen' column.
			print "%s\t%s\t%s\t0\t0\t0\t0\t0\t0\t%s" % (sample['Description'], sample['Reference'], id, read.qlen)
		else:
			mapped += 1
    return Stats(contig_lengths, n_vals)

def stats(seq_name, fh, fmt):
    """Print one tab-separated line of contig-length statistics.

    The three arguments are forwarded unchanged to ``get_stats``. Its result
    is read through two attributes: ``contig_lengths`` (passed to ``len``,
    ``min``, ``max`` and ``sum``) and ``n_vals`` (indexed with 0.5, 0.75
    and 0.9).

    Columns, in order: number of contigs, shortest, longest, total length,
    mean length, then the ``n_vals`` entries for 0.5, 0.75 and 0.9. Every
    column is formatted with ``%d``.
    """
    summary = get_stats(seq_name, fh, fmt)
    lengths = summary.contig_lengths

    count = len(lengths)
    shortest = min(lengths)
    longest = max(lengths)
    total = sum(lengths)
    mean = total / count

    quantiles = summary.n_vals
    row = (count, shortest, longest, total, mean,
           quantiles[0.5], quantiles[0.75], quantiles[0.9])
    print("%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d" % row)

if __name__ == "__main__":
    x = read_run_details(sys.argv[1])
    try:
        filter = sys.argv[2]
    except:
        filter = None

    print "Centre\tRunID\tStrain\tTech\tNotes\tReads\tMin\tMax\tSum\tAvg\tN50\tN75\tN90"
    for r in x:
        if filter and filter != r['RunID']:
            continue

        print "%s\t%s\t%s\t%s\t%s\t" % (r['Centre'], r['RunID'], r['Strain'], r['Tech'], r['Notes']),

        if r['Tech'] == '454':
            fmt = 'sff'
        else:
import sys
from runutils import read_assemblies, read_run_details, hashit

# Print a tab-separated table with one row per assembly, combining the
# assembly list (first command-line argument) with selected columns of the
# summary table (second command-line argument).
assemblies = read_assemblies(sys.argv[1])
# Presumably indexes the summary rows by their 'Name' column -- verify
# against runutils.hashit.
summaries = hashit(read_run_details(sys.argv[2]), 'Name')

# Header of the summary table, kept for reference:
#Name   NumContigs      NumRefReplicons NumAssemblyBases        NumReferenceBases       NumLCBs DCJ_Distance    NumDCJBlocks    NumSNPs NumMisCalled    NumUnCalled     NumGapsRef      NumGapsAssembly TotalBasesMissed        PercBasesMissed ExtraBases      PercExtraBases  MissingChromosomes      ExtraContigs    NumSharedBoundaries     NumInterLcbBoundaries   BrokenCDS       IntactCDS       ContigN50       ContigN90       MinContigLength MaxContigLength AA      AC      AG      AT      CA      CC      CG      CT      GA      GC      GG      GT      TA      TC      TG      TT

# Summary columns copied into the output, in output order.
fields = ['NumContigs', 'NumAssemblyBases', 'MaxContigLength', 'ContigN50', 'NumLCBs', 'NumGapsRef', 'NumGapsAssembly', 'PercBasesMissed']

print("\t".join(["Sample", "Assembler"] + fields))

for a in assemblies:
    # A summary row may be keyed by the assembly name with or without a
    # '.fas' suffix. Only a missing key triggers the fallback; any other
    # error (or Ctrl-C) propagates instead of being swallowed.
    # Assumes `summaries` raises KeyError on a miss, as a dict does.
    try:
        s = summaries[a['Name'] + '.fas']
    except KeyError:
        s = summaries[a['Name']]

    # Emit the row in a single print so a failure while collecting the
    # fields cannot leave a half-written line on stdout.
    values = "\t".join([s[f] for f in fields])
    print("%s\t%s\t%s" % (a['Desc'], a['AssemblySoftware'], values))