Ejemplo n.º 1
0
def by_genome(x, args):
    """Simulate fragments for a single genome.

    Builds a ``Genome`` object, optionally runs in-silico PCR when the
    genome has a primer file, then draws fragments with ``SimFrags`` until
    either a coverage target or a fixed fragment count is reached.
    Progress and summary messages are written to stderr.

    Parameters
    ----------
    x : list
        [taxonName, inFile]  (unpacked in that order)
        taxonName -- taxon name of genome
        inFile -- genome sequence file name
    args : dict
        user-provided args. Keys read by this function:
        '--fr'  -- passed to ``Genome`` as its third argument
        '--MFE' -- MFEprimer executable, passed to ``callMFEprimer``
        '--rtr' -- passed to ``callMFEprimer``; only required when
                   ``genome.primerFile`` is not None
        '--fld', '--flr', '--rtl' -- passed to ``SimFrags``
        '--nf'  -- string; if it ends in 'x' or 'X' the leading number is a
                   coverage multiplier of ``genome.length``, otherwise it is
                   converted with ``int()`` and used as the number of
                   fragments to simulate

    Returns
    -------
    list
        [taxonName, fragList] where ``taxonName`` is ``genome.taxonName``
        and ``fragList`` is a dict mapping each scaffold returned by
        ``simFrag`` to a list of ``[fragStart, fragLen, fragGC]`` lists.

    Raises
    ------
    AssertionError
        if '--fr' is missing from ``args``, or if '--rtr' is missing while
        the genome has a primer file.
    """
    taxonName, inFile = x
    # status
    sys.stderr.write('Processing: "{}"\n'.format(taxonName))

    # making genome object
    assert '--fr' in args, '"--fr" must be provided in args'
    genome = Genome(inFile, taxonName, args['--fr'])

    # MFEprimer.py executable
    MFEprimerExe = args['--MFE']

    # sequenced read template location: amplicons
    if genome.primerFile is not None:
        # in-silico PCR
        assert '--rtr' in args, '"--rtr" must be in args'
        genome.callMFEprimer(rtr=args['--rtr'], MFEprimerExe=MFEprimerExe)

        # filtering overlapping in-silico amplicons
        genome.filterOverlaps()

    # simulating fragments
    simFO = SimFrags(fld=args['--fld'], flr=args['--flr'], rtl=args['--rtl'])
    nFragsMade = 0
    fragList = dict()
    ## if no amplicons: nothing to simulate
    if genome.nAmplicons == 0:
        pass
    ## if using coverage
    elif args['--nf'].endswith(('X', 'x')):
        coverage = float(args['--nf'].rstrip('xX'))
        fragLenCov = genome.length * coverage
        fragLenTotal = 0
        while True:
            scaf, fragStart, fragLen, fragGC = simFO.simFrag(genome)
            # The scaffold is registered before the stop checks, so a
            # scaffold may end up in fragList with an empty list.
            fragList.setdefault(scaf, [])

            # Stop when simFrag reports "NA" or when the fragments already
            # kept exceed the coverage target; the fragment just drawn is
            # discarded in either case.
            if fragStart == "NA" or fragLenTotal > fragLenCov:
                break
            fragLenTotal += fragLen

            nFragsMade += 1
            fragList[scaf].append([fragStart, fragLen, fragGC])
    ## if using fixed number of fragments
    else:
        nFragsRequested = int(args['--nf'])
        # Every pass either breaks or adds exactly one fragment, so counting
        # on nFragsMade is equivalent to iterating nFragsRequested times and
        # avoids the Python-2-only ``xrange``.
        while nFragsMade < nFragsRequested:
            scaf, fragStart, fragLen, fragGC = simFO.simFrag(genome)
            # Registered before the "NA" check (see coverage branch).
            fragList.setdefault(scaf, [])

            if fragStart == "NA":
                break

            nFragsMade += 1
            fragList[scaf].append([fragStart, fragLen, fragGC])

    # status
    sys.stderr.write('  Genome name: {}\n'.format(genome.taxonName))
    sys.stderr.write('  Genome length (bp): {}\n'.format(genome.length))
    # NOTE(review): the amplicon count is reported whenever '--nf' is truthy;
    # this may have been meant to depend on the primer file -- confirm.
    if args['--nf']:
        msg = '  Number of amplicons: {}\n'
        sys.stderr.write(msg.format(genome.nAmplicons))
    msg = '  Number of fragments simulated: {}\n'
    sys.stderr.write(msg.format(nFragsMade))

    return [genome.taxonName, fragList]