Python Genecount Examples

Programming Language: Python

Namespace/Package Name: genecount

Method/Function: Genecount

Examples at hotexamples.com: 2

Python Genecount - 2 examples found. These are the top-rated real-world Python examples of genecount.Genecount, extracted from open source projects. You can rate the examples to help us improve their quality.

Example #1

Show file

File: main.py Project: Scathacheng/DCC

def wraphostgenecount(bamfile, tmp_dir, circ_coor, ref, countlinearsplicedreads=True):
    """Run host gene counting for one BAM file and return the output file name.

    A ``Gc.Genecount`` object is created for ``tmp_dir`` and its
    ``comb_gen_count`` method is called with the circRNA coordinates, the BAM
    file, the reference and a freshly generated output path.

    Args:
        bamfile: Path of the BAM file to count; only its basename is used in
            the output file name.
        tmp_dir: Prefix of the output path. It is concatenated as-is, so it is
            presumably expected to end with a path separator -- TODO confirm
            against the callers.
        circ_coor: circRNA coordinates, forwarded to ``comb_gen_count``.
        ref: Reference, forwarded to ``comb_gen_count``.
        countlinearsplicedreads: Flag forwarded to ``comb_gen_count``.

    Returns:
        The path of the ``*_junction.linear`` file handed to
        ``comb_gen_count`` as its output argument.
    """
    # create the Genecount object
    gc = Gc.Genecount(tmp_dir)

    # generate a unique ID so that concurrent runs do not share an output file
    tid = id_generator()

    # create a (temporary) output file name based on tid and the BAM file name
    output = tmp_dir + "tmp_" + os.path.basename(bamfile) + "_" + tid + "_junction.linear"

    # Single-argument print call: same output on Python 2, valid on Python 3.
    print("Counting host gene expression based on "
          "detected and filtered circRNA coordinates for %s" % bamfile)

    # launch the gene counting
    gc.comb_gen_count(circ_coor, bamfile, ref, output, countlinearsplicedreads)

    # return this input file's output name
    return output

Example #2

Show file

File: main.py Project: Voineagulab/DCC

def _build_parser():
    """Build and return the argparse parser for the DCC command line."""
    parser = argparse.ArgumentParser(
        prog='DCC',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        fromfile_prefix_chars='@',
        description='Contact [email protected]')

    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s 0.3.2')
    parser.add_argument(
        "Input",
        metavar='Input',
        nargs="+",
        help=
        "Input of the chimeric.out.junction file from STAR. Alternatively, a sample sheet specifying where your chimeric.out.junction files are, each sample per line, provide with @ prefix (e.g. @samplesheet)."
    )
    parser.add_argument("-temp",
                        "--temp",
                        dest="temp",
                        action='store_true',
                        default=False,
                        help="Temporary files will not be deleted.")

    # NOTE: parser.add_argument_group(title, description) already registers
    # the group, so no second registration call is needed afterwards.
    group = parser.add_argument_group(
        "Find circRNA Options", "Options to find circRNAs from STAR output.")
    group.add_argument(
        "-D",
        "--detect",
        action='store_true',
        dest="detect",
        default=False,
        help=
        "Always specify if you want detect circRNAs from chimeric junctions.")
    group.add_argument(
        "-ss",
        action='store_true',
        dest="secondstrand",
        default=False,
        help=
        "For stranded libraries, specify when the library is fr-secondstrand.")
    group.add_argument(
        "-N",
        "--nonstrand",
        action='store_false',
        dest="strand",
        default=True,
        help="Specify when the library is non-stranded [default stranded].")
    group.add_argument(
        "-E",
        "--endTol",
        dest="endTol",
        type=int,
        default=5,
        choices=range(0, 10),
        help=
        "Maximum base pair tolerance of reads extending over junction sites. [Interger, default 5]"
    )
    group.add_argument(
        "-m",
        "--maximum",
        dest="max",
        type=int,
        default=1000000,
        help=
        "The maximum range of candidate circRNA allowed (including introns). [default 1000000]"
    )
    group.add_argument(
        "-n",
        "--minimum",
        dest="min",
        type=int,
        default=30,
        help=
        "The minimum range of candidate circRNA allowed (including introns). [default 30]"
    )
    group.add_argument(
        "-an",
        "--annotation",
        dest="annotate",
        help=
        "Gene annotation file in GTF/GFF3 format, to annotate circRNAs by their host gene name/identifier."
    )
    group.add_argument(
        "-Pi",
        "--PE-independent",
        action='store_true',
        dest="pairedendindependent",
        default=False,
        help=
        "Specify when you have mapped the PE data mates separately. If specified, -mt1 and -mt2 should also be provied. [default False]"
    )
    group.add_argument(
        "-mt1",
        "--mate1",
        dest="mate1",
        nargs='+',
        help=
        "For paired end data, Chimeric.out.juntion files from mate1 independent mapping result."
    )
    group.add_argument(
        "-mt2",
        "--mate2",
        dest="mate2",
        nargs='+',
        help=
        "For paired end data, Chimeric.out.juntion files from mate2 independent mapping result."
    )

    group = parser.add_argument_group(
        "Filtering Options", "Options to filter the circRNA candidates.")
    group.add_argument(
        "-F",
        "--filter",
        action='store_true',
        dest="filter",
        default=False,
        help=
        "If specified, the program will do filtering on the detection results."
    )
    group.add_argument(
        "-M",
        "--chrM",
        action='store_true',
        dest="chrM",
        default=False,
        help=
        "If specified, candidates from mitochondria chromosome will be removed."
    )
    group.add_argument(
        "-R",
        "--rep_file",
        dest="rep_file",
        help=
        "Custom repetitive region file in GTF format to filter out circRNAs candidates in repetitive regions."
    )
    group.add_argument(
        "-L",
        "--Ln",
        dest="length",
        type=int,
        default=50,
        help="Minimum length to check for repetitive regions. [default 50]")
    group.add_argument('-Nr',
                       nargs=2,
                       type=int,
                       metavar=('level1', 'threshold1'),
                       default=[2, 5],
                       help='Minimum read counts required for circRNAs; \
                        Minimum number of samples above the corresponding expression level'
                       )
    group.add_argument(
        "-fg",
        "--filterbygene",
        action='store_true',
        dest="filterbygene",
        default=False,
        help=
        "If specified, filter by gene annotation. Candidates are not allowed to span more than one gene."
    )

    group = parser.add_argument_group(
        "Host gene count Options", "Options to count host gene expression.")
    group.add_argument(
        "-G",
        "--gene",
        action='store_true',
        dest="gene",
        default=False,
        help=
        "If specified, the program will count host gene expression given circRNA coordinates. By default, use the circRNA candidates detected from the same run."
    )
    group.add_argument(
        "-C",
        "--circ",
        dest="circ",
        help=
        "User specified circRNA coordinates, any tab delimited file with first three columns as circRNA coordinates: chr\tstart\tend, which DCC will use to count host gene expression."
    )
    group.add_argument(
        "-B",
        "--bam",
        dest="bam",
        nargs='+',
        help=
        "A file specify the mapped bam files from which host gene expression is computed. Must have the same order as input chimeric junction files."
    )
    group.add_argument("-A",
                       "--refseq",
                       dest="refseq",
                       help="Reference sequence fasta file.")
    return parser


def main():
    """Command line entry point of DCC.

    Parses the command line, sets up a time-stamped log file and a fresh
    ``_tmp_DCC`` working directory, and then runs the stages that were
    requested through the options, in this order:

    1. circRNA detection (``-D``) per input junction file, followed by
       combining the per-sample counts into one table.
    2. Filtering (``-F``) of either the detection result or, in filter-only
       mode, of exactly two user-supplied files (count file, coordinate file).
    3. Host gene counting (``-G``) per BAM file.
    4. CircSkip junction counting, when an annotation is given, detection ran
       in this invocation and no custom coordinate file (``-C``) was supplied.
    5. Removal of temporary files unless ``-temp`` was given.

    The function relies on module-level names that are not part of this
    block (``CC``, ``FC``, ``circAnnotate`` and helpers such as
    ``getfilename``, ``checkjunctionfiles``, ``fixall``, ``mergefiles``,
    ``convertjunctionfile2bamfile``, ``getSJ_out_tab``,
    ``findCircSkipJunction`` and ``logdeleted``).

    Raises:
        SystemExit: on argparse errors, when filter-only mode is not given
            exactly two input files, or when the number of BAM files does not
            match the number of junction files.
    """
    options = _build_parser().parse_args()

    timestr = time.strftime("%Y%m%d-%H%M%S")
    logging.basicConfig(filename='DCC.log' + timestr,
                        filemode='w',
                        level=logging.DEBUG,
                        format='%(asctime)s %(message)s')

    logging.info('version:0.3.2')
    logging.info(' '.join(sys.argv))
    logging.info('DCC started')

    # Always start from an empty working directory.
    try:
        os.mkdir('_tmp_DCC')
    except OSError:
        from shutil import rmtree
        rmtree('_tmp_DCC/')
        os.mkdir('_tmp_DCC')

    # Get input file names
    filenames = [getfilename(name) for name in options.Input]
    samplelist = '\t'.join(filenames)

    # check whether the junction file names have duplicates
    same = False
    if len(set(filenames)) != len(options.Input):
        logging.info(
            'Input file names have duplicates, add number suffix in input order to output files for distinction.'
        )
        print(
            'Input file names have duplicates, add number suffix in input order to output files for distinction.'
        )
        same = True

    # Make instance
    cm = CC.Combine()
    circAnn = circAnnotate.CircAnnotate(strand=options.strand)

    # Bound up front so that the later CircSkip and clean-up stages never hit
    # an unbound name when detection or CircSkip counting was skipped.
    circfiles = []  # A list for .circRNA file names
    CircSkipfiles = []
    CircSkipfilesmapped = []

    if checkjunctionfiles(options.Input, options.mate1, options.mate2):
        logging.info('circRNA detection skipped due to empty junction files.')
        print('circRNA detection skipped due to empty junction files.')

        options.detect = False

    if options.detect:
        logging.info('Program start to detect circRNAs.')
        if options.strand:
            logging.info('Strand data.')
        else:
            logging.info(
                'nonstrand data, the strand of circRNAs guessed from the strandness of host genes.'
            )
            print(
                'WARNING: nonstrand data, the strand of circRNAs guessed from the strandness of host genes.'
            )

        # Start de novo circular RNA detection model
        # Create instances
        f = FC.Findcirc(endTol=options.endTol,
                        maxL=options.max,
                        minL=options.min)
        sort = FC.Sort()

        def wrapfindcirc(files, output, strand=True, pairdendindependent=True):
            """Detect circles in one junction file and write sorted counts."""
            if pairdendindependent:
                f.printcircline(files, '_tmp_DCC/tmp_printcirclines')
                f.sepDuplicates('_tmp_DCC/tmp_printcirclines',
                                '_tmp_DCC/tmp_duplicates',
                                '_tmp_DCC/tmp_nonduplicates')
                # Find small circles
                f.smallcirc('_tmp_DCC/tmp_duplicates',
                            '_tmp_DCC/tmp_smallcircs')
                # Find normal circles
                f.findcirc('_tmp_DCC/tmp_nonduplicates',
                           '_tmp_DCC/tmp_normalcircs',
                           strand=strand)
                # Merge small and normal circles
                mergefiles('_tmp_DCC/tmp_findcirc', '_tmp_DCC/tmp_smallcircs',
                           '_tmp_DCC/tmp_normalcircs')
            else:
                f.findcirc(files, '_tmp_DCC/tmp_findcirc', strand=strand)
            # Sort
            sort.sort_count('_tmp_DCC/tmp_findcirc', output, strand=strand)

        if options.pairedendindependent:
            print(
                'Please make sure that the read pairs have been mapped both, combined and on a per mate basis'
            )
            logging.info(
                "Please make sure that the read pairs have been mapped both, combined and on a per mate basis"
            )

            # Fix2chimera problem by STAR
            print('Collecting chimera from mates-separate mapping.')
            logging.info('Collecting chimera from mates-separate mapping.')
            Input = fixall(options.Input, options.mate1, options.mate2)
        else:
            Input = options.Input

        for indx, files in enumerate(Input):
            logging.info('started circRNA detection based on %s' % files)
            print('started circRNA detection based on %s' % files)
            # Duplicate sample names get their input position as suffix.
            suffix = str(indx) if same else ''
            circfilename = '_tmp_DCC/' + getfilename(files) + suffix + '.circRNA'
            circfiles.append(circfilename)

            wrapfindcirc(files,
                         circfilename,
                         strand=options.strand,
                         pairdendindependent=options.pairedendindependent)

        ### Combine the individual count files
        print('Combining individual circRNA read counts.')
        logging.info('Combining individual circRNA read counts.')

        cm.comb_coor(circfiles, strand=options.strand)
        cm.map('_tmp_DCC/tmp_coordinates', circfiles, strand=options.strand)

        res = cm.combine([files + 'mapped' for files in circfiles],
                         col=7,
                         circ=True)

        # swap strand if the sequences are sense strand
        if options.secondstrand and options.strand:
            logging.info('Swapping strand information.')
            strand_swap = {'+\n': '-\n', '-\n': '+\n'}
            with open('_tmp_DCC/tmp_coordinates') as toswap:
                coordinate_lines = toswap.readlines()
            with open('_tmp_DCC/tmp_coordinatesswaped', 'w') as swaped:
                for lin in coordinate_lines:
                    lin_split = lin.split('\t')
                    # The strand is the sixth and last column, so the value
                    # still carries the trailing newline.
                    lin_split[5] = strand_swap[lin_split[5]]
                    swaped.write('\t'.join(lin_split))
            os.remove('_tmp_DCC/tmp_coordinates')
            os.rename('_tmp_DCC/tmp_coordinatesswaped',
                      '_tmp_DCC/tmp_coordinates')

        if options.filter:
            # Results stay in the temporary directory; the filtering stage
            # below writes the final files.
            cm.writeouput('_tmp_DCC/tmp_circCount', res)
            if options.annotate:
                logging.info('Writing annotation.')
                logging.info('Selecting gene features in Annotation file.')
                circAnn.selectGeneGtf(options.annotate)
                circAnn.annotate(
                    '_tmp_DCC/tmp_coordinates',
                    '_tmp_DCC/tmp_' + getfilename(options.annotate) + '.gene',
                    '_tmp_DCC/tmp_coordinatesannotated')
                os.remove('_tmp_DCC/tmp_coordinates')
                os.rename('_tmp_DCC/tmp_coordinatesannotated',
                          '_tmp_DCC/tmp_coordinates')
        else:
            cm.writeouput('CircRNACount', res, samplelist, header=True)
            if options.annotate:
                logging.info('Write in annotation.')
                logging.info('Select gene features in Annotation file.')
                circAnn.selectGeneGtf(options.annotate)
                circAnn.annotate(
                    '_tmp_DCC/tmp_coordinates',
                    '_tmp_DCC/tmp_' + getfilename(options.annotate) + '.gene',
                    'CircCoordinates')
                circAnn.annotateregions('CircCoordinates', options.annotate)
            else:
                os.rename('_tmp_DCC/tmp_coordinates', 'CircCoordinates')

    ### Filtering
    if options.filter:
        logging.info('Filtering started')

        import circFilter as FT
        filt = FT.Circfilter(length=options.length,
                             level1=options.Nr[0],
                             threshold1=options.Nr[1])

        if options.detect:
            file2filter = '_tmp_DCC/tmp_circCount'
            coorfile = '_tmp_DCC/tmp_coordinates'
            logging.info(
                'Take file _tmp_DCC/tmp_circCount and _tmp_DCC/tmp_coordinates for filtering'
            )
            print(
                'Using files _tmp_DCC/tmp_circCount and _tmp_DCC/tmp_coordinates for filtering'
            )
        elif len(options.Input) == 2:
            # Only use the program for filtering: the first input is the file
            # to filter, the second one the coordinate file.
            file2filter = options.Input[0]
            coorfile = options.Input[1]
            logging.info('Using files %s and %s for filtering' %
                         (options.Input[0], options.Input[1]))
            print('Using files %s and %s for filtering' %
                  (options.Input[0], options.Input[1]))
        else:
            # Without detection there is nothing to filter unless exactly two
            # files were supplied; fail with a clear message instead of an
            # unbound-variable error further down.
            logging.error(
                'Program exit because input error. Please check the input. If only use the program for filtering, a coordinate file in bed6 format and a count file is needed.'
            )
            sys.exit(
                'Please check the input. If only use the program for filtering, a coordinate file in bed6 format and a count file is needed.'
            )

        if options.rep_file:
            rep_file = options.rep_file
        else:
            from pkg_resources import resource_filename
            rep_file = resource_filename('DCC', 'data/DCC.Repeats')
        count, indx = filt.readcirc(file2filter, coorfile)
        logging.info('Filtering by read counts.')
        count0, indx0 = filt.filtercount(
            count, indx)  # result of first filtering by read counts
        filt.makeregion(indx0)
        logging.info('Filtering by non repetitive regions.')
        nonrep_left, nonrep_right = filt.nonrep_filter('_tmp_DCC/tmp_left',
                                                       '_tmp_DCC/tmp_right',
                                                       rep_file)
        filt.intersectLeftandRightRegions(nonrep_left, nonrep_right, indx0,
                                          count0)
        if not options.chrM and not options.filterbygene:
            filt.sortOutput('_tmp_DCC/tmp_unsortedWithChrM',
                            'CircRNACount',
                            samplelist,
                            'CircCoordinates',
                            split=True)

        # Filter chrM, if no further filtering, return 'CircRNACount' and 'CircCoordinates', else return '_tmp_DCC/tmp_unsortedNoChrM'
        if options.chrM:
            logging.info(
                'Deleting circRNA candidates from mitochondrial chromosome.')
            filt.removeChrM('_tmp_DCC/tmp_unsortedWithChrM')
            if not options.filterbygene:
                filt.sortOutput('_tmp_DCC/tmp_unsortedNoChrM',
                                'CircRNACount',
                                samplelist,
                                'CircCoordinates',
                                split=True)
        else:
            os.rename(
                '_tmp_DCC/tmp_unsortedWithChrM', '_tmp_DCC/tmp_unsortedNoChrM'
            )  # Note in this case '_tmp_DCC/tmp_unsortedNoChrM' actually has chrM

        # Filter by gene annotation, require one circRNA could not from more than one gene. return final 'CircRNACount' and 'CircCoordinates'
        if options.filterbygene:
            if options.annotate:
                logging.info(
                    'Filtering by gene annotation. CircRNA candidates from more than one genes are deleted.'
                )
                circAnn.filtbygene('_tmp_DCC/tmp_unsortedNoChrM',
                                   '_tmp_DCC/tmp_unsortedfilterbygene')
                filt.sortOutput('_tmp_DCC/tmp_unsortedfilterbygene',
                                'CircRNACount',
                                samplelist,
                                'CircCoordinates',
                                split=True)
            else:
                logging.warning(
                    'To filter by gene annotation, a annotation file in GTF/GFF format needed, skiped filter by gene annotation.'
                )
                filt.sortOutput('_tmp_DCC/tmp_unsortedNoChrM',
                                'CircRNACount',
                                samplelist,
                                'CircCoordinates',
                                split=True)

        # Add annotation of regions
        if options.annotate:
            circAnn.annotateregions('CircCoordinates', options.annotate)

        logging.info('Filtering finished')

    if options.gene:
        import genecount as GC
        # import the list of bamfile names as a file
        if not options.bam:
            logging.info(
                'Look for mapped bam files in the same directory as chimeric.out.junction files.'
            )
            bamfiles = convertjunctionfile2bamfile(options.Input)
        else:
            bamfiles = options.bam

        if not options.refseq:
            print(
                'Please provide reference sequence, program will not count host gene expression.'
            )
            logging.warning(
                'Please provide reference sequence, program will not count host gene expression.'
            )
        else:
            # check whether the number of bamfiles is equale to the number of chimeric.junction.out files
            if len(bamfiles) != len(options.Input):
                logging.error(
                    "The number of bam files does not match with chimeric junction files."
                )
                sys.exit(
                    "The number of bam files does not match with chimeric junction files."
                )

            # For each sample (each bamfile), do one host gene count, and then combine to a single table
            gc = GC.Genecount()

            linearfiles = []  # A list for .linear file names
            for indx, files in enumerate(options.Input):
                suffix = str(indx) if same else ''
                linearfiles.append('_tmp_DCC/' + getfilename(files) + suffix +
                                   '.linear')

            for indx, bamfile in enumerate(bamfiles):
                if options.circ:
                    logging.info(
                        'Counting linear gene expression based on provided circRNA coordinates for %s'
                        % bamfile)
                    circ_coordinates = options.circ
                else:
                    logging.info(
                        'Counting host gene expression based on detected and filtered circRNA coordinates for %s'
                        % bamfile)
                    print(
                        'Counting host gene expression based on detected and filtered circRNA coordinates'
                    )
                    circ_coordinates = 'CircRNACount'
                gc.comb_gen_count(circ_coordinates,
                                  bamfile,
                                  options.refseq,
                                  linearfiles[indx],
                                  countlinearsplicedreads=False)

            logging.info(
                "Finished linear gene expression counting, start to combine individual sample counts"
            )

            # Combine all to a individual sample host gene count to a single table
            res = cm.combine(linearfiles, col=6, circ=False)
            cm.writeouput('LinearCount', res, samplelist, header=True)
            logging.info(
                'Finished combine individual linear gene expression counts')

            if not options.temp:
                deleted = cm.deletfile(os.getcwd(), linearfiles)
                logdeleted(deleted)

    # CircSkip junction
    # Needs the per-sample .circRNA files, which only exist when detection ran
    # in this invocation.
    if options.annotate and options.detect and not options.circ:
        logging.info('Count CircSkip junctions.')
        print('Count CircSkip junctions.')
        SJ_out_tab = getSJ_out_tab(options.Input)
        CircSkipfiles = findCircSkipJunction('CircCoordinates',
                                             options.annotate,
                                             circfiles,
                                             SJ_out_tab,
                                             strand=options.strand,
                                             same=same)
        # Drop the header line before mapping.
        with open('CircCoordinates', 'r') as coordinates:
            fin = coordinates.readlines()[1:]
        with open('_tmp_DCC/tmp_CircCoordinatesNoheader', 'w') as fout:
            fout.writelines(fin)
        cm.map('_tmp_DCC/tmp_CircCoordinatesNoheader',
               CircSkipfiles,
               strand=options.strand,
               col=4)
        CircSkipfilesmapped = [fname + 'mapped' for fname in CircSkipfiles]
        res = cm.combine(CircSkipfilesmapped, col=9)
        cm.writeouput('CircSkipJunctions', res, samplelist, header=True)
    else:
        logging.info('CircSkip junctions cannot be count.')

    # Delete temporary files
    if not options.temp:
        p3 = r'^tmp_\.*'
        deleted = cm.deletfile(os.path.join(os.getcwd(), '_tmp_DCC/'), p3)
        logdeleted(deleted)
        leftovers = (
            (os.getcwd(),
             circfiles + [files + 'mapped' for files in circfiles]),
            ('', CircSkipfiles),
            ('', CircSkipfilesmapped),
        )
        for directory, names in leftovers:
            # Nothing to remove for stages that did not run.
            if names:
                deleted = cm.deletfile(directory, names)
                logdeleted(deleted)
    logging.info('DCC completed successfully.')