Exemple #1
0
        else:
            print("No sequence for: " + rec.query_name)
        #print("rpos: " + str(rpos) + " qpos: " + str(qpos))
    return(seqs)
         
#sys.exit()
    

# Logfile
# Stays None unless args.logfile is set; otherwise the file is opened in "w+" mode.
loghandle = open(args.logfile, "w+") if args.logfile else None

print(f"Nr. of Contigs: {len(contigs)}")

# Build the long-read collection, then run the processing steps in their
# original order; loghandle (possibly None) is passed to the turn-around step.
scafs = Longreads(args.inputfiles, blacklist, args.linename)
# Disabled step: scafs.filter_low_quality_contigs(0.76)
scafs.turn_longreads_around(logging=loghandle)
scafs.sort_by_starts()
scafs.filter_contigs_by_coverage(0.5, ignore_ends=True, verbose=False)

print(f"Nr. of reads: {len(scafs.lreads)}")

# Parse Recipe File
needed_reads = set()
items = []
with open(args.contigstringfile) as f:
    for line in f:
        if line.startswith("#"):
            continue
        if len(line.split()) == 2:
parser.add_argument(
    "--blacklistfile",
    help="File containing long read ids where certain contig mappings should be ignored.",
)
args = parser.parse_args()

# Blacklist file: one whitespace-separated entry per line.
#   contig <name>   -> blacklist[<name>] is set to the marker string "y"
#   <key> <value>   -> <value> is appended to the list stored at blacklist[<key>]
# NOTE(review): going by the --blacklistfile help text, <key> is presumably a
# long read id and <value> a contig name; confirm against Longreads.
blacklist = defaultdict(list)
if args.blacklistfile:
    with open(args.blacklistfile) as f:
        for line in f:
            sline = line.split()
            if not sline:
                # Blank line (e.g. a trailing newline): split() yields [] and
                # sline[0] would raise IndexError, so skip it.
                continue
            if sline[0] == "contig":
                blacklist[sline[1]] = "y"
            else:
                blacklist[sline[0]].append(sline[1])


# Load the long reads with a one-element whitelist holding args.lrid, then
# orient and sort them.
scafs = Longreads(args.inputfiles, blacklist, args.linename, whitelist_lreads={args.lrid})
scafs.turn_longreads_around()
scafs.sort_by_starts()

# Record id -> sequence string for every record in the FASTQ file.
lrseqs = {
    record.id: str(record.seq)
    for record in SeqIO.parse(args.sequencefile, "fastq")
}

# args.contigs must split on "-" into exactly two parts.
ctg1, ctg2 = args.contigs.split("-")
read = scafs.lreads[args.lrid]
sc = ec = 0
for ctg in read["maps"]:
    if ctg["name"] == ctg1:
        #print("\t".join([ctg["name"], str(ctg["scr"]), str(ctg["ecr"]), str(ctg["scc"]), str(ctg["ecc"])]))
Exemple #3
0
                    sccs.append(ctgs[ctgn][ctgidx]["scc"])
                    eccs.append(ctgs[ctgn][ctgidx]["ecc"])
                    strands.append(ctgs[ctgn][ctgidx]["strand"])
                newctg = {
                    "strand": round(mean(strands)),
                    "name": ctgn,
                    "scc": round(mean(sccs)),
                    "ecc": round(mean(eccs)),
                    "scr": round(mean(cluster)) + round(mean(sccs)),
                    "ecr": round(mean(cluster)) + round(mean(eccs))
                }
                p["maps"].append(newctg)
    return pseudolongreads


scafs = Longreads(args.inputfiles, blacklist, args.linename)

# Filtering / orientation steps, run in this fixed order with the thresholds
# hard-coded by the script.
# Disabled step: scafs.filter_whitelist_ctgs(set(["1115APD"]))
scafs.filter_small_contigs(300)
scafs.filter_reverse_small_contigs(600)
scafs.filter_low_quality_contigs(0.81)
scafs.turn_longreads_around()
scafs.sort_by_starts()
scafs.filter_contigcounts(args.mincontigs)
# Disabled step: scafs.copy()

print(f"Nr. of reads: {len(scafs.lreads)}")

status = 0
for iteration in range(10):
    print("Pseudoaligning all... ", end="")
    lr_scores, lr_dists = scafs.pseudoalign_all()
    # Only the first line of f is inspected: the loop always stops after one
    # iteration.
    for line in f:
        if not line.startswith(">"):
            # The line does not start with ">", so the input is handled as PAF.
            format = "paf"
            celllinestr = line.split()[0]
            # The first run of capital letters in the first field is reported
            # as the cell line.
            m = re.search('[A-Z]+', celllinestr)
            cellline = m.group(0)
            print("cell line detected: " + cellline)
        break

# Load the input according to the value of `format`.
if format == "fasta":
    # Record id -> upper-cased sequence string.
    for read in SeqIO.parse(args.inputfile, "fasta"):
        seqs[read.id] = str(read.seq).upper()
elif format == "paf":
    scaf = Longreads.init_from_reverse_paf(args.inputfile)
    scaf.sort_by_starts()
    lread = scaf.lreads

else:
    print("Problem! Format unknown")
    # Exit with a non-zero status so the failure is visible to the caller;
    # a bare sys.exit() would report success (status 0).
    sys.exit(1)

# SVG output object and its drawing handle.
image = LongReadSVG(args.output, zoom=800)
dwg = image.dwg

# Layout constants (presumably vertical / horizontal padding; verify where
# they are used).
ypad, xpad = 7, 20

# Two colour names; col starts out as the first one.
col1, col2 = "black", "lightgrey"
col = col1
# Optional whitelists, one stripped entry per line of the given file.
# Only one of the two files is read: args.whitelist takes precedence over
# args.whitelist_lrs.
whitelist_ctgs = set()
whitelist_lreads = set()
if args.whitelist:
    with open(args.whitelist) as f:
        whitelist_ctgs.update(line.strip() for line in f)
elif args.whitelist_lrs:
    with open(args.whitelist_lrs) as f:
        whitelist_lreads.update(line.strip() for line in f)

# Record id -> sequence length for every record in the contig FASTA file.
contigs = {
    record.id: len(record.seq)
    for record in SeqIO.parse(args.contigfile, "fasta")
}

scafs = Longreads(args.inputfiles, blacklist, args.linename, whitelist_lreads)
if whitelist_ctgs:
    scafs.filter_whitelist_ctgs(whitelist_ctgs)

# The contig-count filter runs twice with the same threshold: once before and
# once after the reads are turned around and sorted.
min_contigs = int(args.mincontigs)
scafs.filter_contigcounts(min_contigs)
reverse_mappers = {"344DBB", "472DBB"}
scafs.turn_longreads_around(reverse_mappers)
scafs.sort_by_starts()
# Disabled step: scafs.filter_small_contigs(300)
# Disabled step: scafs.filter_overlapped_contigs(0.5)
scafs.filter_contigcounts(min_contigs)
scafs.print_ids()

print(f"Reads meeting criteria: {len(scafs.lreads)}")
print("Pseudoaligning all...")
Exemple #6
0
# Store the sequence length of every record in the contig FASTA file under its
# record id (contigs is created outside this excerpt).
for read in SeqIO.parse(args.contigfile, "fasta"):
    contigs[read.id] = len(read.seq)

print(f"Nr. of scaffolds: {len(contigs)}")

# Blacklist file: one whitespace-separated entry per line.
#   contig <name>   -> blacklist[<name>] is set to the marker string "y"
#   <key> <value>   -> <value> is appended to the list stored at blacklist[<key>]
blacklist = defaultdict(list)
if args.blacklistfile:
    with open(args.blacklistfile) as f:
        for line in f:
            sline = line.split()
            if not sline:
                # Blank line (e.g. a trailing newline): split() yields [] and
                # sline[0] would raise IndexError, so skip it.
                continue
            if sline[0] == "contig":
                blacklist[sline[1]] = "y"
            else:
                blacklist[sline[0]].append(sline[1])

# Load the long reads and run the processing steps in their original order.
lrs = Longreads(args.inputfiles, blacklist, args.linename)
lrs.filter_contigcounts(2)
lrs.turn_longreads_around()
lrs.sort_contigs_in_reads()
greads = lrs.lreads

# cluster np-reads
print("scaffolding long reads ....")
# Empty state for the clustering loop that follows.
contig2cluster, creads = {}, {}
clusternr = 0
while len(greads) > 0:
    clusternr += 1
    current_cluster = {}
    current_contigs = set()
    # take a random read and build a cluster from it
Exemple #7
0
reads = {}
cgreads = []

# Blacklist file: one whitespace-separated entry per line.
#   contig <name>   -> blacklist[<name>] is set to the marker string "y"
#   <key> <value>   -> <value> is appended to the list stored at blacklist[<key>]
blacklist = defaultdict(list)
if args.blacklistfile:
    with open(args.blacklistfile) as f:
        for line in f:
            sline = line.split()
            if not sline:
                # Blank line (e.g. a trailing newline): split() yields [] and
                # sline[0] would raise IndexError, so skip it.
                continue
            if sline[0] == "contig":
                blacklist[sline[1]] = "y"
            else:
                blacklist[sline[0]].append(sline[1])


# Load the long reads and apply the count / size filters first.
lrs = Longreads(args.inputfiles, blacklist, args.linename)
lrs.filter_contigcounts(2)
lrs.filter_small_contigs(300)
# Disabled step: lrs.filter_reverse_small_contigs(600)

# Two hard-coded names handed to the turn-around step.
reverse_mappers = {"344DBB", "472DBB"}
lrs.turn_longreads_around(reverse_mappers)
lrs.sort_by_starts()

contig2lrid = lrs.ctg2lreads

def get_full_name(short_ctgn):
    if "_" in short_ctgn:
        nr = short_ctgn.split("_")[0]