def wraphostgenecount(bamfile, tmp_dir, circ_coor, ref, countlinearsplicedreads=True):
    """Count host gene expression for one BAM file and return the result path.

    A ``Gc.Genecount`` object is created for ``tmp_dir`` and its
    ``comb_gen_count`` method is run on ``bamfile``.  The result file name is
    built from ``tmp_dir``, the BAM file's base name and the token returned by
    ``id_generator()``.

    Args:
        bamfile: path of the BAM file to count.
        tmp_dir: prefix for the output file; it is concatenated as-is, so it
            is expected to already end with a path separator.
        circ_coor: circRNA coordinates, forwarded to ``comb_gen_count``.
        ref: reference, forwarded to ``comb_gen_count``.
        countlinearsplicedreads: forwarded to ``comb_gen_count`` unchanged.

    Returns:
        The name of the ``*_junction.linear`` file written for this input.
    """
    counter = Gc.Genecount(tmp_dir)

    # Token that keeps output names of concurrent workers apart.
    token = id_generator()

    linear_path = "".join([
        tmp_dir,
        "tmp_",
        os.path.basename(bamfile),
        "_",
        token,
        "_junction.linear",
    ])

    print("Counting host gene expression based on "
          "detected and filtered circRNA coordinates for %s" % bamfile)

    counter.comb_gen_count(circ_coor, bamfile, ref, linear_path,
                           countlinearsplicedreads)
    return linear_path
def _build_parser():
    """Build the DCC command-line parser.

    Returns:
        argparse.ArgumentParser: parser with the positional ``Input`` files,
        the ``--temp`` switch and three option groups (circRNA detection,
        filtering, host gene counting).  Arguments may also be read from a
        file given with an ``@`` prefix.
    """
    parser = argparse.ArgumentParser(
        prog='DCC',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        fromfile_prefix_chars='@',
        description='Contact [email protected]')
    parser.add_argument('--version', action='version',
                        version='%(prog)s 0.3.2')
    parser.add_argument(
        "Input", metavar='Input', nargs="+",
        help="Input of the chimeric.out.junction file from STAR. "
             "Alternatively, a sample sheet specifying where your "
             "chimeric.out.junction files are, each sample per line, "
             "provide with @ prefix (e.g. @samplesheet).")
    parser.add_argument("-temp", "--temp", dest="temp", action='store_true',
                        default=False,
                        help="Temporary files will not be deleted.")

    # --- circRNA detection -------------------------------------------------
    group = parser.add_argument_group(
        "Find circRNA Options", "Options to find circRNAs from STAR output.")
    group.add_argument(
        "-D", "--detect", action='store_true', dest="detect", default=False,
        help="Always specify if you want detect circRNAs from chimeric "
             "junctions.")
    group.add_argument(
        "-ss", action='store_true', dest="secondstrand", default=False,
        help="For stranded libraries, specify when the library is "
             "fr-secondstrand.")
    group.add_argument(
        "-N", "--nonstrand", action='store_false', dest="strand",
        default=True,
        help="Specify when the library is non-stranded [default stranded].")
    group.add_argument(
        "-E", "--endTol", dest="endTol", type=int, default=5,
        choices=range(0, 10),
        help="Maximum base pair tolerance of reads extending over junction "
             "sites. [Interger, default 5]")
    group.add_argument(
        "-m", "--maximum", dest="max", type=int, default=1000000,
        help="The maximum range of candidate circRNA allowed (including "
             "introns). [default 1000000]")
    group.add_argument(
        "-n", "--minimum", dest="min", type=int, default=30,
        help="The minimum range of candidate circRNA allowed (including "
             "introns). [default 30]")
    group.add_argument(
        "-an", "--annotation", dest="annotate",
        help="Gene annotation file in GTF/GFF3 format, to annotate circRNAs "
             "by their host gene name/identifier.")
    group.add_argument(
        "-Pi", "--PE-independent", action='store_true',
        dest="pairedendindependent", default=False,
        help="Specify when you have mapped the PE data mates separately. "
             "If specified, -mt1 and -mt2 should also be provied. "
             "[default False]")
    group.add_argument(
        "-mt1", "--mate1", dest="mate1", nargs='+',
        help="For paired end data, Chimeric.out.juntion files from mate1 "
             "independent mapping result.")
    group.add_argument(
        "-mt2", "--mate2", dest="mate2", nargs='+',
        help="For paired end data, Chimeric.out.juntion files from mate2 "
             "independent mapping result.")

    # --- filtering ---------------------------------------------------------
    group = parser.add_argument_group(
        "Filtering Options", "Options to filter the circRNA candidates.")
    group.add_argument(
        "-F", "--filter", action='store_true', dest="filter", default=False,
        help="If specified, the program will do filtering on the detection "
             "results.")
    group.add_argument(
        "-M", "--chrM", action='store_true', dest="chrM", default=False,
        help="If specified, candidates from mitochondria chromosome will be "
             "removed.")
    group.add_argument(
        "-R", "--rep_file", dest="rep_file",
        help="Custom repetitive region file in GTF format to filter out "
             "circRNAs candidates in repetitive regions.")
    group.add_argument(
        "-L", "--Ln", dest="length", type=int, default=50,
        help="Minimum length to check for repetitive regions. [default 50]")
    group.add_argument(
        '-Nr', nargs=2, type=int, metavar=('level1', 'threshold1'),
        default=[2, 5],
        help='Minimum read counts required for circRNAs; '
             'Minimum number of samples above the corresponding expression '
             'level')
    group.add_argument(
        "-fg", "--filterbygene", action='store_true', dest="filterbygene",
        default=False,
        help="If specified, filter by gene annotation. Candidates are not "
             "allowed to span more than one gene.")

    # --- host gene counting ------------------------------------------------
    group = parser.add_argument_group(
        "Host gene count Options", "Options to count host gene expression.")
    group.add_argument(
        "-G", "--gene", action='store_true', dest="gene", default=False,
        help="If specified, the program will count host gene expression "
             "given circRNA coordinates. By default, use the circRNA "
             "candidates detected from the same run.")
    group.add_argument(
        "-C", "--circ", dest="circ",
        help="User specified circRNA coordinates, any tab delimited file "
             "with first three columns as circRNA coordinates: "
             "chr\tstart\tend, which DCC will use to count host gene "
             "expression.")
    group.add_argument(
        "-B", "--bam", dest="bam", nargs='+',
        help="A file specify the mapped bam files from which host gene "
             "expression is computed. Must have the same order as input "
             "chimeric junction files.")
    group.add_argument("-A", "--refseq", dest="refseq",
                       help="Reference sequence fasta file.")
    return parser


def main():
    """Run the DCC command-line pipeline.

    Depending on the parsed options the following stages run in order:

    1. circRNA detection (``-D``): one ``.circRNA`` file per input is written
       below ``_tmp_DCC/`` and the per-sample counts are combined.
    2. Filtering (``-F``): on the detection results, or on a count file plus
       a coordinate file given as the two positional inputs.
    3. Host gene counting (``-G``): one ``.linear`` file per BAM file,
       combined into ``LinearCount``.
    4. CircSkip junction counting (needs ``-an``, ``-D`` and no ``-C``).
    5. Removal of temporary files unless ``--temp`` was given.

    Progress is logged to ``DCC.log<timestamp>`` in the working directory.
    Every ``print`` call takes a single parenthesized argument so that it
    behaves the same under the Python 2 print statement.

    Raises:
        SystemExit: if filtering is requested without usable input files, or
            if the number of BAM files differs from the number of inputs.
    """
    options = _build_parser().parse_args()

    timestr = time.strftime("%Y%m%d-%H%M%S")
    logging.basicConfig(filename='DCC.log' + timestr,
                        filemode='w',
                        level=logging.DEBUG,
                        format='%(asctime)s %(message)s')
    logging.info('version:0.3.2')
    logging.info(' '.join(sys.argv))
    logging.info('DCC started')

    # Start from an empty scratch directory.  Any OSError from mkdir is
    # treated as "already exists": the directory is wiped and recreated.
    try:
        os.mkdir('_tmp_DCC')
    except OSError:
        from shutil import rmtree
        rmtree('_tmp_DCC/')
        os.mkdir('_tmp_DCC')

    # Get input file names
    filenames = [getfilename(name) for name in options.Input]
    samplelist = '\t'.join(filenames)

    # Duplicate sample names get the input index appended to their outputs.
    same = False
    if len(set(filenames)) != len(options.Input):
        msg = ('Input file names have duplicates, add number suffix in '
               'input order to output files for distinction.')
        logging.info(msg)
        print(msg)
        same = True

    cm = CC.Combine()
    circAnn = circAnnotate.CircAnnotate(strand=options.strand)

    # Lists of temporary files.  They stay None when the stage producing
    # them did not run, so the final cleanup can skip them safely.
    circfiles = None
    circskip_files = None
    circskip_mapped = None

    if checkjunctionfiles(options.Input, options.mate1, options.mate2):
        msg = 'circRNA detection skipped due to empty junction files.'
        logging.info(msg)
        print(msg)
        options.detect = False

    # ------------------------------------------------------------------
    # circRNA detection
    # ------------------------------------------------------------------
    if options.detect:
        logging.info('Program start to detect circRNAs.')
        if options.strand:
            logging.info('Strand data.')
        else:
            logging.info('nonstrand data, the strand of circRNAs guessed '
                         'from the strandness of host genes.')
            print('WARNING: nonstrand data, the strand of circRNAs guessed '
                  'from the strandness of host genes.')

        f = FC.Findcirc(endTol=options.endTol,
                        maxL=options.max,
                        minL=options.min)
        sort = FC.Sort()
        circfiles = []  # A list for .circRNA file names

        def wrapfindcirc(files, output, strand=True,
                         pairdendindependent=True):
            """Detect circRNAs in one junction file and write sorted counts."""
            if pairdendindependent:
                f.printcircline(files, '_tmp_DCC/tmp_printcirclines')
                f.sepDuplicates('_tmp_DCC/tmp_printcirclines',
                                '_tmp_DCC/tmp_duplicates',
                                '_tmp_DCC/tmp_nonduplicates')
                # Find small circles
                f.smallcirc('_tmp_DCC/tmp_duplicates',
                            '_tmp_DCC/tmp_smallcircs')
                # Find normal circles
                f.findcirc('_tmp_DCC/tmp_nonduplicates',
                           '_tmp_DCC/tmp_normalcircs',
                           strand=bool(strand))
                # Merge small and normal circles
                mergefiles('_tmp_DCC/tmp_findcirc',
                           '_tmp_DCC/tmp_smallcircs',
                           '_tmp_DCC/tmp_normalcircs')
            else:
                f.findcirc(files, '_tmp_DCC/tmp_findcirc',
                           strand=bool(strand))
            # Sort
            sort.sort_count('_tmp_DCC/tmp_findcirc', output,
                            strand=bool(strand))

        if options.pairedendindependent:
            msg = ('Please make sure that the read pairs have been mapped '
                   'both, combined and on a per mate basis')
            print(msg)
            logging.info(msg)
            # Fix2chimera problem by STAR
            msg = 'Collecting chimera from mates-separate mapping.'
            print(msg)
            logging.info(msg)
            Input = fixall(options.Input, options.mate1, options.mate2)
        else:
            Input = options.Input

        for indx, files in enumerate(Input):
            msg = 'started circRNA detection based on %s' % files
            logging.info(msg)
            print(msg)
            suffix = str(indx) if same else ''
            circfilename = ('_tmp_DCC/' + getfilename(files) + suffix +
                            '.circRNA')
            circfiles.append(circfilename)
            wrapfindcirc(files, circfilename,
                         strand=bool(options.strand),
                         pairdendindependent=bool(
                             options.pairedendindependent))

        # Combine the individual count files
        msg = 'Combining individual circRNA read counts.'
        print(msg)
        logging.info(msg)
        cm.comb_coor(circfiles, strand=options.strand)
        cm.map('_tmp_DCC/tmp_coordinates', circfiles, strand=options.strand)
        res = cm.combine([files + 'mapped' for files in circfiles],
                         col=7, circ=True)

        # swap strand if the sequences are sense strand
        if options.secondstrand and options.strand:
            logging.info('Swapping strand information.')
            # The strand is the last column here, hence the newline.
            strand_swap = {'+\n': '-\n', '-\n': '+\n'}
            with open('_tmp_DCC/tmp_coordinates') as toswap:
                lines = toswap.readlines()
            with open('_tmp_DCC/tmp_coordinatesswaped', 'w') as swaped:
                for lin in lines:
                    lin_split = lin.split('\t')
                    lin_split[5] = strand_swap[lin_split[5]]
                    swaped.write('\t'.join(lin_split))
            os.remove('_tmp_DCC/tmp_coordinates')
            os.rename('_tmp_DCC/tmp_coordinatesswaped',
                      '_tmp_DCC/tmp_coordinates')

        if options.filter:
            # Filtering follows: keep everything in the scratch directory.
            cm.writeouput('_tmp_DCC/tmp_circCount', res)
            if options.annotate:
                logging.info('Writing annotation.')
                logging.info('Selecting gene features in Annotation file.')
                circAnn.selectGeneGtf(options.annotate)
                circAnn.annotate(
                    '_tmp_DCC/tmp_coordinates',
                    '_tmp_DCC/tmp_' + getfilename(options.annotate) + '.gene',
                    '_tmp_DCC/tmp_coordinatesannotated')
                os.remove('_tmp_DCC/tmp_coordinates')
                os.rename('_tmp_DCC/tmp_coordinatesannotated',
                          '_tmp_DCC/tmp_coordinates')
        else:
            # No filtering: these are the final outputs.
            cm.writeouput('CircRNACount', res, samplelist, header=True)
            if options.annotate:
                logging.info('Write in annotation.')
                logging.info('Select gene features in Annotation file.')
                circAnn.selectGeneGtf(options.annotate)
                circAnn.annotate(
                    '_tmp_DCC/tmp_coordinates',
                    '_tmp_DCC/tmp_' + getfilename(options.annotate) + '.gene',
                    'CircCoordinates')
                circAnn.annotateregions('CircCoordinates', options.annotate)
            else:
                os.rename('_tmp_DCC/tmp_coordinates', 'CircCoordinates')

    # ------------------------------------------------------------------
    # Filtering
    # ------------------------------------------------------------------
    if options.filter:
        logging.info('Filtering started')
        import circFilter as FT
        filt = FT.Circfilter(length=options.length,
                             level1=options.Nr[0],
                             threshold1=options.Nr[1])

        if options.detect:
            file2filter = '_tmp_DCC/tmp_circCount'
            coorfile = '_tmp_DCC/tmp_coordinates'
            logging.info('Take file _tmp_DCC/tmp_circCount and '
                         '_tmp_DCC/tmp_coordinates for filtering')
            print('Using files _tmp_DCC/tmp_circCount and '
                  '_tmp_DCC/tmp_coordinates for filtering')
        elif len(options.Input) == 2:
            # Filter-only mode: a count file followed by a coordinate file
            # (bed6).
            file2filter, coorfile = options.Input
            msg = 'Using files %s and %s for filtering' % (
                options.Input[0], options.Input[1])
            logging.info(msg)
            print(msg)
        else:
            # Nothing to filter: log first, then exit.
            logging.error(
                'Program exit because input error. Please check the input. '
                'If only use the program for filtering, a coordinate file '
                'in bed6 format and a count file is needed.')
            sys.exit(
                'Please check the input. If only use the program for '
                'filtering, a coordinate file in bed6 format and a count '
                'file is needed.')

        if options.rep_file:
            rep_file = options.rep_file
        else:
            from pkg_resources import resource_filename
            rep_file = resource_filename('DCC', 'data/DCC.Repeats')

        count, indx = filt.readcirc(file2filter, coorfile)
        logging.info('Filtering by read counts.')
        # result of first filtering by read counts
        count0, indx0 = filt.filtercount(count, indx)
        filt.makeregion(indx0)
        logging.info('Filtering by non repetitive regions.')
        nonrep_left, nonrep_right = filt.nonrep_filter(
            '_tmp_DCC/tmp_left', '_tmp_DCC/tmp_right', rep_file)
        filt.intersectLeftandRightRegions(nonrep_left, nonrep_right,
                                          indx0, count0)

        if not options.chrM and not options.filterbygene:
            filt.sortOutput('_tmp_DCC/tmp_unsortedWithChrM', 'CircRNACount',
                            samplelist, 'CircCoordinates', split=True)

        # Filter chrM.  Without further filtering this writes the final
        # 'CircRNACount' and 'CircCoordinates'; otherwise the result is
        # left in '_tmp_DCC/tmp_unsortedNoChrM' for the gene filter.
        if options.chrM:
            logging.info(
                'Deleting circRNA candidates from mitochondrial chromosome.')
            filt.removeChrM('_tmp_DCC/tmp_unsortedWithChrM')
            if not options.filterbygene:
                filt.sortOutput('_tmp_DCC/tmp_unsortedNoChrM', 'CircRNACount',
                                samplelist, 'CircCoordinates', split=True)
        else:
            # Note: in this case 'tmp_unsortedNoChrM' still contains chrM.
            os.rename('_tmp_DCC/tmp_unsortedWithChrM',
                      '_tmp_DCC/tmp_unsortedNoChrM')

        # Filter by gene annotation: a circRNA may not span several genes.
        if options.filterbygene:
            if options.annotate:
                logging.info('Filtering by gene annotation. CircRNA '
                             'candidates from more than one genes are '
                             'deleted.')
                circAnn.filtbygene('_tmp_DCC/tmp_unsortedNoChrM',
                                   '_tmp_DCC/tmp_unsortedfilterbygene')
                filt.sortOutput('_tmp_DCC/tmp_unsortedfilterbygene',
                                'CircRNACount', samplelist,
                                'CircCoordinates', split=True)
            else:
                logging.warning(
                    'To filter by gene annotation, a annotation file in '
                    'GTF/GFF format needed, skiped filter by gene '
                    'annotation.')
                filt.sortOutput('_tmp_DCC/tmp_unsortedNoChrM', 'CircRNACount',
                                samplelist, 'CircCoordinates', split=True)

        # Add annotation of regions
        if options.annotate:
            circAnn.annotateregions('CircCoordinates', options.annotate)

        logging.info('Filtering finished')

    # ------------------------------------------------------------------
    # Host gene counting
    # ------------------------------------------------------------------
    if options.gene:
        import genecount as GC

        if not options.bam:
            logging.info('Look for mapped bam files in the same directory '
                         'as chimeric.out.junction files.')
            bamfiles = convertjunctionfile2bamfile(options.Input)
        else:
            bamfiles = options.bam

        if not options.refseq:
            msg = ('Please provide reference sequence, program will not '
                   'count host gene expression.')
            print(msg)
            logging.warning(msg)
        else:
            # BAM files are matched to junction files by position.
            if len(bamfiles) != len(options.Input):
                msg = ('The number of bam files does not match with '
                       'chimeric junction files.')
                logging.error(msg)
                sys.exit(msg)

            # One host gene count per sample, combined into one table.
            gc = GC.Genecount()
            linearfiles = []  # A list for .linear file names
            for indx, files in enumerate(options.Input):
                suffix = str(indx) if same else ''
                linearfiles.append('_tmp_DCC/' + getfilename(files) +
                                   suffix + '.linear')

            for bamfile, linearfilename in zip(bamfiles, linearfiles):
                if options.circ:
                    logging.info(
                        'Counting linear gene expression based on provided '
                        'circRNA coordinates for %s' % bamfile)
                    gc.comb_gen_count(options.circ, bamfile, options.refseq,
                                      linearfilename,
                                      countlinearsplicedreads=False)
                else:
                    logging.info(
                        'Counting host gene expression based on detected '
                        'and filtered circRNA coordinates for %s' % bamfile)
                    print('Counting host gene expression based on detected '
                          'and filtered circRNA coordinates')
                    gc.comb_gen_count('CircRNACount', bamfile,
                                      options.refseq, linearfilename,
                                      countlinearsplicedreads=False)

            logging.info('Finished linear gene expression counting, start '
                         'to combine individual sample counts')

            res = cm.combine(linearfiles, col=6, circ=False)
            cm.writeouput('LinearCount', res, samplelist, header=True)
            logging.info(
                'Finished combine individual linear gene expression counts')

            if not options.temp:
                deleted = cm.deletfile(os.getcwd(), linearfiles)
                logdeleted(deleted)

    # ------------------------------------------------------------------
    # CircSkip junctions
    # ------------------------------------------------------------------
    # Needs the per-sample .circRNA files, so detection must have run.
    if options.annotate and options.detect and not options.circ:
        msg = 'Count CircSkip junctions.'
        logging.info(msg)
        print(msg)
        sj_out_tab = getSJ_out_tab(options.Input)
        circskip_files = findCircSkipJunction('CircCoordinates',
                                              options.annotate,
                                              circfiles,
                                              sj_out_tab,
                                              strand=options.strand,
                                              same=same)
        # Drop the header line before mapping.
        with open('CircCoordinates', 'r') as fin:
            body = fin.readlines()[1:]
        with open('_tmp_DCC/tmp_CircCoordinatesNoheader', 'w') as fout:
            fout.writelines(body)
        cm.map('_tmp_DCC/tmp_CircCoordinatesNoheader', circskip_files,
               strand=options.strand, col=4)
        circskip_mapped = [fname + 'mapped' for fname in circskip_files]
        res = cm.combine(circskip_mapped, col=9)
        cm.writeouput('CircSkipJunctions', res, samplelist, header=True)
    else:
        logging.info('CircSkip junctions cannot be count.')

    # ------------------------------------------------------------------
    # Delete temporary files
    # ------------------------------------------------------------------
    if not options.temp:
        p3 = r'^tmp_\.*'
        deleted = cm.deletfile(os.path.join(os.getcwd(), '_tmp_DCC/'), p3)
        logdeleted(deleted)
        if circfiles is not None:
            deleted = cm.deletfile(
                os.getcwd(),
                circfiles + [files + 'mapped' for files in circfiles])
            logdeleted(deleted)
        if circskip_files is not None:
            deleted = cm.deletfile('', circskip_files)
            logdeleted(deleted)
        if circskip_mapped is not None:
            deleted = cm.deletfile('', circskip_mapped)
            logdeleted(deleted)

    logging.info('DCC completed successfully.')