Esempio n. 1
0
parser.add_argument('-o', help='Output JSON bead file.')
args = parser.parse_args()

inputFile = args.i
maxee = args.maxee
outputFile = args.o

# Load every bead record (one JSON object per line).
with open(inputFile, 'r') as f:
    mockRaw = [json.loads(line) for line in f]
print('Find {0} bead in the file'.format(len(mockRaw)))
#%% For each bead, remove low quality read and duplicated reads
# Then write to a new JSON file
i = 0
for record in mockRaw:
    qcPassed = bead.maxEE(record, maxee=maxee)
    dereplicated = bead.derep(qcPassed)
    processed = bead.beadSequence(dereplicated)
    # Only save bead with fragments left.
    if len(processed.fragments) > 0:
        i += 1
        processed.jsonWrite(outputFile, mode='a')
print('{0} beads pass the QC and derep'.format(i))
#%% The JSON file can be read in by line
# A single line can be converted to a bead Class
'''
beadList = []
with open(outputFile, 'r') as f:
    for line in f:
        beadList.append(bead.beadSequence(json.loads(line)))
'''
Esempio n. 2
0
parser = argparse.ArgumentParser()
parser.add_argument('-i', help='Input beadJson file')
parser.add_argument('-o', help='Output short fragment count per bead')
parser.add_argument('-d', help='Output distribution of count')
args = parser.parse_args()

inputFile = args.i
outputFile = args.o
outputDist = args.d

# count: barcode -> number of fragments on that bead.
# dist:  fragment count -> how many beads have that count.
count = {}
dist = {}
with open(inputFile, 'r') as f:
    for line in f:
        currentBead = bead.beadSequence(json.loads(line.strip('\n')))
        nFragments = len(currentBead.fragments)
        count[currentBead.barcode] = nFragments
        dist[nFragments] = dist.get(nFragments, 0) + 1

# Output
with open(outputDist, 'w') as f:
    f.write('{0}\t{1}\n'.format('FragmentCount', 'Frequency'))
    for key, value in dist.items():
        f.write('{0}\t{1}\n'.format(str(key), str(value)))

# Write the per-bead fragment-count table.
# NOTE(review): only the header is written — the loop emitting count.items()
# appears to be missing here (source looks truncated); confirm against the
# original script before relying on this output.
with open(outputFile, 'w') as f:
    f.write('{0}\t{1}\n'.format('Barcode', 'FragmentCount'))
Esempio n. 3
0
parser.add_argument('-k', default=21, type=int, help='Kmer size default = 21')
args = parser.parse_args()

inputFile = args.i
outputRaw = args.rawout
outputThreshold = args.tout
threshold = args.threshold
kmerSize = args.k

# Read in JSON-Bead file and build a kmer pool for every bead.
kmerPool = []
beadCount = 0
with open(inputFile, 'r') as f:
    for line in f:
        currentBead = bead.beadSequence(json.loads(line))
        kmerPool.append(kmer.kmerCount(currentBead, kmerSize))
        beadCount += 1
print('Found {0} beads.'.format(beadCount))

# Calculate the mash distance for every pair of beads.
# edge keeps all pairs; edgeThreshold keeps only pairs whose distance
# falls inside [threshold[0], threshold[1]].
edge = []
edgeThreshold = []
n1 = 0
n2 = 0
for left, right in combinations(kmerPool, 2):
    D = kmer.kmerDistance((left.set, right.set)).mashDistance()
    edge.append((left.barcode, right.barcode, D))
    if threshold[0] <= D <= threshold[1]:
        edgeThreshold.append((left.barcode, right.barcode, D))
        n2 += 1
def main():
    """Compute mash distances for every pair of beads using worker processes.

    Relies on module-level names defined elsewhere in the file:
    inputFile, outputRaw, kmerSize, job (number of workers), bead, kmer,
    kmerDistanceWorker, Manager, Process, combinations, time, sys, json.
    Writes the raw edge list (Source, Target, Distance) to outputRaw.
    """
    # Read in JSON-Bead file
    #Calculate kmer pools for all beads
    kmerPool = []
    beadCount = 0
    with open(inputFile, 'r') as f:
        for line in f:
            b = bead.beadSequence(json.loads(line))
            kmerPool.append(kmer.kmerCount(b, kmerSize))
            beadCount += 1
    print('Found {0} beads.'.format(beadCount))

    # Set up the parallel environment:
    # shared lists holding one edge-list slot and one progress counter per worker.
    manager = Manager()
    # NOTE(review): [[]] * job repeats a reference to ONE inner list before the
    # manager copies it — presumably each worker replaces edge[i] wholesale
    # rather than appending to the inner list; confirm in kmerDistanceWorker.
    edge = manager.list([[]] * job)  # n list for edge list
    count = manager.list([0] * job)  # n list for count

    print('Starting mash distance ...')

    # Divide the kmer pair pool
    pairPool = []
    for pair in combinations(kmerPool, 2):
        pairPool.append(pair)
    size = len(pairPool)
    print('Total is {0} pairs.'.format(size))
    step = size // job
    print('Step is {0}'.format(step))
    start = 0

    workers = []
    print(len(pairPool))
    for i in range(job):
        if i + 1 < job:  # not the last job
            workers.append(
                Process(target=kmerDistanceWorker,
                        args=(pairPool[start:start + step], edge, i, count)))
            start += step
            print('Start change to {0}'.format(start))
        else:
            # The last worker takes the remainder of the pool (size may not
            # divide evenly by job).
            workers.append(
                Process(target=kmerDistanceWorker,
                        args=(pairPool[start:], edge, i, count)))

    print('Starting %i jobs ...' % job)
    count_worker = 1
    for j in workers:
        j.start()
        print('Starting thread No. %i ...' % count_worker)
        count_worker += 1

    # Busy-wait (with a short sleep) until every worker exits, reporting
    # percentage progress from the shared counters on stderr.
    job_alive = True
    while job_alive:
        time.sleep(0.01)
        job_alive = False
        for j in workers:
            if j.is_alive():
                job_alive = True
        progress = str(sum(count) / size * 100) + "\r"
        sys.stderr.write(progress)
        #print(len(edge[0]))

    for j in workers:
        j.join()
    print('Finished dereplicating.')

    # Merge each worker's edge list into the raw output table.
    # NOTE(review): outputThreshold / the thresholded edge list are never
    # written here — the function may be truncated in this view; confirm
    # against the original script.
    with open(outputRaw, 'w') as f:
        f.write('Source\tTarget\tDistance\n')
        for item in edge:
            for line in item:
                f.write('{0}\t{1}\t{2}\n'.format(line[0], line[1], line[2]))
Esempio n. 5
0
        if item[2] > 0.02:
            f.write('{0}\t{1}\t{2}\n'.format(item[0], item[1], item[2]))
'''

#%%
''' Extract bead sequences by module number '''
from metaSeq import io as seqIO
from metaSeq import bead

# Map barcode -> module (cluster) number from the tab-separated module table.
module = {}
with open('kmer.jcd.0.02.module.txt', 'r') as f:
    f.readline()  # skip the header line
    for row in f:
        fields = row.strip('\n').split('\t')
        module[fields[0]] = fields[1]
print(len(module))

# One empty sequence bucket per distinct module number.
cluster = {}

for moduleNumber in set(module.values()):
    cluster[moduleNumber] = []
print(len(cluster))
beads = seqIO.beadJson('CL100077200_L01.json')
for record in beads:
    currentBead = bead.beadSequence(record)
    classNumber = module.get(currentBead.barcode, False)
    if classNumber:
        cluster[classNumber] += currentBead.fastaSequences()
print(len(cluster))
for key, value in cluster.items():
    seqIO.write_seqs(value, 'cluster/{0}.fa'.format(key), fastx='a', mode='w')
Esempio n. 6
0
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 30 16:47:24 2018

@author: Zewei Song
@email: [email protected]
"""
#%%
from __future__ import print_function
from __future__ import division
import json
from metaSeq import io as seqIO
from metaSeq import qc as seqQC
from metaSeq import bead

# Read in the JSON bead file (one bead object per line).
inputFile = 'mock.qc.derep.json'
with open(inputFile, 'r') as f:
    beadList = [bead.beadSequence(json.loads(line)) for line in f]
#%%
# Write the first 10 beads into FASTA files (one file per bead via
# bead.fastaWrite()). A slice replaces the original manual counter/break,
# removes the unused `threshold = 100` variable and the commented-out dead
# code, and naturally handles lists shorter than 10.
for item in beadList[:10]:
    item.fastaWrite()
Esempio n. 7
0
                                        [email protected]
                                        [email protected]
                                        ------------------------'''))
parser.add_argument('-i', help='merged fasta file.')
parser.add_argument('-d', help='Output directory.')
parser.add_argument(
    '-z', help='save fa with missing tag(0000) into a seperated directory.')

args = parser.parse_args()
#%% Try to sample bead with more than 1 fragments
inputBeadJson = args.i

# Load every bead from the JSON-per-line file.
with open(inputBeadJson, 'r') as f:
    beadPool = [bead.beadSequence(json.loads(line)) for line in f]
print('Find {0} beads.'.format(len(beadPool)))

#%% Get beads with more than 1 fragment
beadPool_1 = [item for item in beadPool if len(item.fragments) > 1]
print('Find {0} bead with more than 1 fragments.'.format(len(beadPool_1)))

#%% Write these bead to FASTA files
outputFolder = args.d  # destination directory for per-bead FASTA files
code0Folder = args.z  # separate directory for beads with the missing tag "0000"

for item in beadPool_1:
    # NOTE(review): the source appears truncated here — the body of this
    # re.match branch (and any else branch) is missing from this view.
    if re.match("0000", item.barcode, 0):