Python PipelineLncRNA Beispiele

Programmiersprache: Python

Namespace / Paketname: CGATPipelines

Klasse / Typ: PipelineLncRNA

Beispiele auf hotexamples.com: 25

Python PipelineLncRNA - 25 Beispiele gefunden. Dies sind die am besten bewerteten Python Beispiele für die CGATPipelines.PipelineLncRNA, die aus Open Source-Projekten extrahiert wurden. Sie können Beispiele bewerten, um die Qualität der Beispiele zu verbessern.

Häufig verwendete Methoden

Anzeigen Verbergen

CounterMultiExonGenes(3)

CounterSingleExonGenes(3)

flagExonStatus(3)

classifyLncRNAGenes(3)

CounterExonsPerGene(2)

extractMAFGeneBlocks(2)

CounterGenes(2)

CounterMultiExonTranscripts(2)

CounterSingleExonTranscripts(2)

CounterTranscripts(2)

buildFilteredLncRNAGeneSet(2)

CounterExonsPerTranscript(2)

gtfToBed12(2)

parsePhyloCSF(1)

filterMAF(1)

buildFinalLncRNAGeneSet(1)

buildRefnoncodingGeneSet(1)

buildRefcodingGeneSet(1)

buildLncRNAGeneSet(1)

buildCodingGeneSet(1)

splitAlignedFasta(1)

Beispiel #1

Datei anzeigen

Datei: pipeline_rnaseqlncrna.py Projekt: Charlie-George/cgat

def classifyLncRNA(infiles, outfile):
    '''
    Classify lncRNA relative to protein coding loci.

    The first two entries of ``infiles`` and ``outfile`` are handed to
    ``PipelineLncRNA.classifyLncRNAGenes``; the distance threshold is
    read from ``PARAMS["lncrna_dist"]`` (the class descriptions below
    are carried over from the original notes and quote it as 2kb).

    lincRNA are classified in terms of their relationship to protein
    coding genes. Indices for intervals are created on the fly - maybe
    additional annotations should be created instead:

    antisense
       transcript overlapping protein coding exons on opposite strand
    antisense_upstream
       transcript < 2kb from tss on opposite strand
    antisense_downstream
       transcript < 2kb from gene end on opposite strand
    sense_upstream
       transcript < 2kb from tss on same strand
    sense_downstream
       transcript < 2kb from gene end on same strand
    intergenic
       transcript >2kb from any protein coding gene
    intronic
       overlaps protein coding gene intron on same strand
    antisense_intronic
       overlaps protein coding intron on opposite strand
    '''
    first_input = infiles[0]
    second_input = infiles[1]
    max_dist = PARAMS["lncrna_dist"]

    PipelineLncRNA.classifyLncRNAGenes(first_input,
                                       second_input,
                                       outfile,
                                       dist=max_dist)

Beispiel #2

Datei anzeigen

Datei: pipeline_rnaseqlncrna.py Projekt: lesheng/cgat

def flagExonStatus(infile, outfile):
    '''
    Flag the exon status of the gene models in a gtf file.

    Thin wrapper that delegates to
    ``PipelineLncRNA.flagExonStatus(infile, outfile)``. Per the original
    notes, two attributes are added to each gtf entry:

    exon_status_locus
       specifies whether the gene model is multi- or single exon
    exon_status
       specifies whether the transcript is multi- or single exon
    '''

    PipelineLncRNA.flagExonStatus(infile, outfile)

Beispiel #3

Datei anzeigen

Datei: pipeline_rnaseqlncrna.py Projekt: Charlie-George/cgat

def classifyFilteredLncRNA(infiles, outfile):
    '''
    Classify all lincRNA before cpc filtering.

    This defines any classes that are represented in the coding set
    that is filtered out. The first two entries of ``infiles`` and
    ``outfile`` are handed to ``PipelineLncRNA.classifyLncRNAGenes``
    with the distance threshold from ``PARAMS["lncrna_dist"]``.

    NOTE: This task is not included when running the full pipeline
    '''
    first_input = infiles[0]
    second_input = infiles[1]
    max_dist = PARAMS["lncrna_dist"]

    PipelineLncRNA.classifyLncRNAGenes(first_input,
                                       second_input,
                                       outfile,
                                       dist=max_dist)

Beispiel #4

Datei anzeigen

Datei: pipeline_rnaseqlncrna.py Projekt: nishantthakur/cgat

 def buildFilteredLncRNAGeneSet(infiles, outfile):
     """
     Create a filtered lncRNA geneset.

     Per the original notes, this geneset will not include any single
     exon lncRNA unless it has been seen previously, i.e. it overlaps
     a previously identified lncRNA.

     The first entry of ``infiles`` is passed as the gene set to filter;
     all remaining entries are passed on as a single sequence
     (presumably the previously identified lncRNA sets - confirm
     against PipelineLncRNA).
     """
     primary_input = infiles[0]
     remaining_inputs = infiles[1:]
     PipelineLncRNA.buildFilteredLncRNAGeneSet(primary_input,
                                               outfile,
                                               remaining_inputs)

Beispiel #5

Datei anzeigen

Datei: pipeline_rnaseqlncrna.py Projekt: nishantthakur/cgat

def buildFinalLncRNAGeneSet(infile, outfile):
    """
    Build the final lncRNA gene set.

    Per the original notes, the final set consists of transcripts that
    pass the initial filtering stage, i.e. that are
    multi-exonic/previously seen single exon transcripts and that
    display low evidence for coding potential.

    Delegates to ``PipelineLncRNA.buildFinalLncRNAGeneSet`` with the
    fixed name "lncrna_filtered_cpc_result" as second argument and the
    ``PARAMS["filtering_cpc"]`` setting as the last one.
    """
    # second positional argument expected by the PipelineLncRNA helper
    cpc_result_name = "lncrna_filtered_cpc_result"
    cpc_setting = PARAMS["filtering_cpc"]

    # filter based on coding potential
    PipelineLncRNA.buildFinalLncRNAGeneSet(
        infile,
        cpc_result_name,
        outfile,
        cpc_setting)

Beispiel #6

Datei anzeigen

Datei: pipeline_rnaseqlncrna.py Projekt: Charlie-George/cgat

def splitLncRNAFasta(infile, outfiles):
    """
    Split an aligned fasta file via PipelineLncRNA.splitAlignedFasta.

    ``PARAMS["phyloCSF_map_species_names"]`` is read as a
    comma-separated list of ``old:new`` pairs. Both halves of every pair
    are prefixed with ">" and collected into a dictionary that is
    handed to ``PipelineLncRNA.splitAlignedFasta`` together with
    ``infile`` and the fixed output directory
    "./phyloCSF/lncrna_fasta".

    ``outfiles`` is not used inside this function.
    """
    species_pairs = (entry.split(":") for entry in
                     PARAMS["phyloCSF_map_species_names"].split(","))
    # map ">old" -> ">new"; a pair without ":" raises IndexError
    header_map = {">" + parts[0]: ">" + parts[1] for parts in species_pairs}
    E.info("Name mapping: %s" % header_map)

    PipelineLncRNA.splitAlignedFasta(infile,
                                     "./phyloCSF/lncrna_fasta",
                                     header_map)

Beispiel #7

Datei anzeigen

Datei: pipeline_rnaseqlncrna.py Projekt: yangjl/cgat

def buildLncRNAGeneSet(infiles, outfile):
    '''
    Build the lncRNA gene set.

    Per the original notes, this is the set of transcripts in the
    abinitio set that do not overlap any protein coding or pseudogene
    transcripts (exons), or additional unwanted ensembl biotypes, in a
    reference gene set. Transcripts need to have a length of at least
    200 bp.

    The first five entries of ``infiles`` are passed on positionally,
    followed by ``outfile`` and the minimum length taken from
    ``PARAMS["lncrna_min_length"]``.
    '''
    min_length = PARAMS["lncrna_min_length"]
    PipelineLncRNA.buildLncRNAGeneSet(
        infiles[0],
        infiles[1],
        infiles[2],
        infiles[3],
        infiles[4],
        outfile,
        min_length)

Beispiel #8

Datei anzeigen

Datei: pipeline_rnaseqlncrna.py Projekt: Charlie-George/cgat

def buildRefnoncodingGeneSet(infile, outfile):
    '''
    Build the filtered refnoncoding gene set.

    Thin wrapper that delegates to
    ``PipelineLncRNA.buildRefnoncodingGeneSet(infile, outfile)``. Per
    the original notes, the refnoncoding geneset is filtered for things
    that are described in ensembl as being:

    Ambiguous_orf
    Retained_intron
    Sense_intronic
    antisense
    Sense_overlapping
    Processed transcript
    '''
    PipelineLncRNA.buildRefnoncodingGeneSet(infile, outfile)

Beispiel #9

Datei anzeigen

Datei: pipeline_rnaseqlncrna.py Projekt: Charlie-George/cgat

 def buildFilteredLncRNAGeneSet(infiles, outfile):
     '''
     Create a filtered lncRNA geneset.

     Per the original notes, the set contains previously identified
     gene models that are supplied by the user.

     ``PARAMS["filtering_remove_single_exon"]`` must be "loci",
     "transcripts" or None; it is forwarded as ``filter_se``. The first
     entry of ``infiles`` is the gene set to filter and all remaining
     entries are passed on as one sequence.
     '''
     single_exon_mode = PARAMS["filtering_remove_single_exon"]
     assert single_exon_mode in ("loci", "transcripts", None)

     PipelineLncRNA.buildFilteredLncRNAGeneSet(infiles[0],
                                               outfile,
                                               infiles[1:],
                                               filter_se=single_exon_mode)

Beispiel #10

Datei anzeigen

Datei: pipeline_rnaseqlncrna.py Projekt: Charlie-George/cgat

def extractLncRNAFastaAlignments(infiles, outfile):
    """
    Extract aligned sequence for each interval in a bed file.

    Receives a gzipped MAF file containing pairwise alignments and a
    bed file containing intervals. The MAF file is decompressed into a
    temporary file under ./phyloCSF, which is handed to
    ``PipelineLncRNA.extractMAFGeneBlocks`` together with the genome
    file and the target/query genome names. Per the original notes the
    result is a single fasta file (``outfile``) containing the aligned
    sequence for each interval.

    ``infiles`` is a ``(bed_file, maf_file)`` pair.

    The temporary MAF file is removed even when decompression or
    extraction fails.
    """
    bed_file, maf_file = infiles
    maf_tmp = P.getTempFilename("./phyloCSF")
    try:
        # NOTE(review): `to_cluster` and `statement` are never passed
        # explicitly; P.run() presumably picks them up (and the names the
        # statement interpolates) from this function's local scope, so
        # they must keep exactly these names - confirm against P.
        to_cluster = False
        statement = ("gunzip -c %(maf_file)s > %(maf_tmp)s")
        P.run()

        target_genome = PARAMS["genome"]
        query_genome = PARAMS["phyloCSF_query_genome"]

        genome_file = os.path.join(PARAMS["genomedir"], PARAMS["genome"])

        gene_models = PipelineLncRNA.extractMAFGeneBlocks(bed_file,
                                                          maf_tmp,
                                                          genome_file,
                                                          outfile,
                                                          target_genome,
                                                          query_genome,
                                                          keep_gaps=False)
        E.info("%i gene_models extracted" % gene_models)
    finally:
        # do not leave the decompressed MAF behind on failure
        os.unlink(maf_tmp)

Beispiel #11

Datei anzeigen

Datei: pipeline_rnaseqlncrna.py Projekt: Charlie-George/cgat

def createMAFAlignment(infiles, outfile):
    """
    Build a filtered, gzipped MAF alignment from a directory of axt files.

    Takes all files ending in "net.axt.gz" in the directory
    ``PARAMS["phyloCSF_location_axt"]``, drops those whose name matches
    the regular expression ``PARAMS["phyloCSF_ignore"]``, concatenates
    the rest and converts them to a single maf file using axtToMaf.
    The maf file is then passed through ``PipelineLncRNA.filterMAF``
    with ``PARAMS["phyloCSF_filter_alignments"]``; per the log messages,
    blocks whose target alignment is too short are ignored and written
    to a separate "_removed.maf" file. Both outputs are gzipped.

    ``infiles`` is not used inside this function. ``outfile`` is
    expected to end in ".gz"; the suffix is stripped while working and
    restored by the final gzip step.

    The two temporary files are removed even when conversion or
    filtering fails.
    """
    outfile = P.snip(outfile, ".gz")
    axt_dir = PARAMS["phyloCSF_location_axt"]
    to_ignore = re.compile(PARAMS["phyloCSF_ignore"])

    axt_files = []
    for axt_file in os.listdir(axt_dir):
        if axt_file.endswith("net.axt.gz") and not to_ignore.search(axt_file):
            axt_files.append(os.path.join(axt_dir, axt_file))
    # sorted so that the concatenation order is reproducible
    axt_files = (" ").join(sorted(axt_files))

    E.info("axt files from which MAF alignment will be created: %s" %
           axt_files)

    target_genome = PARAMS["phyloCSF_target_genome"]
    target_contigs = os.path.join(PARAMS["annotations_annotations_dir"],
                                  PARAMS_ANNOTATIONS["interface_contigs"])
    query_genome = PARAMS["phyloCSF_query_genome"]
    query_contigs = os.path.join(PARAMS["phyloCSF_query_assembly"],
                                 PARAMS_ANNOTATIONS["interface_contigs"])

    tmpf1 = P.getTempFilename("./phyloCSF")
    tmpf2 = P.getTempFilename("./phyloCSF")
    try:
        # NOTE(review): P.run() is called without arguments; it presumably
        # reads `statement`, `to_cluster` and the interpolated names from
        # this function's local scope, so these locals must keep their
        # names - confirm against P.
        to_cluster = False
        # concatenate the axt files into tmpf1, then convert them to a
        # single maf file (tmpf2) with axtToMaf
        statement = ("zcat %(axt_files)s"
                     " > %(tmpf1)s;"
                     " axtToMaf "
                     "  -tPrefix=%(target_genome)s."
                     "  -qPrefix=%(query_genome)s."
                     "  %(tmpf1)s"
                     "  %(target_contigs)s"
                     "  %(query_contigs)s"
                     "  %(tmpf2)s")
        P.run()

        E.info("Temporary axt file created %s" % os.path.abspath(tmpf1))
        E.info("Temporary maf file created %s" % os.path.abspath(tmpf2))

        removed = P.snip(outfile, ".maf") + "_removed.maf"
        to_cluster = False
        # filtered[0]: number of ignored blocks, filtered[1]: blocks kept
        filtered = PipelineLncRNA.filterMAF(
            tmpf2,
            outfile,
            removed,
            PARAMS["phyloCSF_filter_alignments"])
        E.info("%s blocks were ignored in MAF alignment"
               " because length of target alignment was too short" %
               filtered[0])
        E.info("%s blocks were output to filtered MAF alignment" %
               filtered[1])
    finally:
        # do not leave the intermediate axt/maf files behind on failure
        os.unlink(tmpf1)
        os.unlink(tmpf2)

    to_cluster = False
    statement = ("gzip %(outfile)s;"
                 " gzip %(removed)s")
    P.run()

Beispiel #12

Datei anzeigen

Datei: pipeline_rnaseqlncrna.py Projekt: Charlie-George/cgat

def buildCodingGeneSet(infiles, outfile):
    '''
    Build the coding gene set from a transcript assembly.

    Per the original notes, this takes the output from cuffcompare of
    a transcript assembly and filters for annotated protein coding
    genes.

    NB "pruned" refers to nomenclature in the transcript building
    pipeline - transcripts that appear in at least two samples.

    Because an abinitio assembly will often contain fragments of known
    transcripts and describe them as novel, the default behaviour is to
    produce a set that is composed of 'complete' or 'contained'
    transcripts, i.e. nothing novel. This may underestimate the number
    of transcripts that are actually expressed.

    The first two entries of ``infiles`` and ``outfile`` are passed to
    ``PipelineLncRNA.buildCodingGeneSet``.
    '''
    first_input = infiles[0]
    second_input = infiles[1]
    PipelineLncRNA.buildCodingGeneSet(first_input,
                                      second_input,
                                      outfile)

Beispiel #13

Datei anzeigen

Datei: pipeline_rnaseqlncrna.py Projekt: lesheng/cgat

def buildRefcodingGeneSetStats(infile, outfile):
    '''
    Write summary counts for a gene set to a tab-separated file.

    The output has one header line and one data line (written without
    a trailing newline) with the columns:

    no. of transcripts
    no. genes
    average number of exons per transcript
    average number of exons per gene
    no. single exon transcripts
    no. multi-exon transcripts
    no. single exon genes
    no. multi-exon genes

    The counts are obtained from the ``PipelineLncRNA.Counter*``
    classes, applied to a temporary copy of ``infile`` that has been
    passed through ``PipelineLncRNA.flagExonStatus``.

    The output file is opened in a ``with`` block so that it is closed
    (and flushed) before the function returns.
    '''

    # calculate exon status for refcoding genes.
    tmpf = P.getTempFilename(".") + ".gz"
    PipelineLncRNA.flagExonStatus(infile, tmpf)

    header = ["no_transcripts",
              "no_genes",
              "no_exons_per_transcript",
              "no_exons_per_gene",
              "no_single_exon_transcripts",
              "no_multi_exon_transcripts",
              "no_single_exon_genes",
              "no_multi_exon_genes"]
    # one counter per header column, in the same order
    counters = (PipelineLncRNA.CounterTranscripts,
                PipelineLncRNA.CounterGenes,
                PipelineLncRNA.CounterExonsPerTranscript,
                PipelineLncRNA.CounterExonsPerGene,
                PipelineLncRNA.CounterSingleExonTranscripts,
                PipelineLncRNA.CounterMultiExonTranscripts,
                PipelineLncRNA.CounterSingleExonGenes,
                PipelineLncRNA.CounterMultiExonGenes)

    with open(outfile, "w") as outf:
        outf.write("\t".join(header) + "\n")
        outf.write("\t".join(
            [str(counter(tmpf).count()) for counter in counters]))

    # remove the flagged temporary file, its ".log" companion
    # (presumably written by flagExonStatus - confirm) and the
    # un-suffixed name returned by P.getTempFilename
    os.unlink(tmpf)
    os.unlink(tmpf + ".log")
    os.unlink(P.snip(tmpf, ".gz"))

Beispiel #14

Datei anzeigen

Datei: pipeline_rnaseqlncrna.py Projekt: Charlie-George/cgat

def extractControllLncRNAFastaAlignments(infiles, outfile):
    """
    Extract aligned sequence for each interval in a control bed file.

    The gzipped MAF file is decompressed into a temporary file under
    /ifs/scratch, which is handed to
    ``PipelineLncRNA.extractMAFGeneBlocks`` together with the bed file,
    the genome file and the target/query genome names; ``outfile`` is
    passed on as the output location.

    ``infiles`` is a ``(bed_file, maf_file)`` pair.

    The temporary MAF file is removed even when decompression or
    extraction fails.
    """
    bed_file, maf_file = infiles
    maf_tmp = P.getTempFilename("/ifs/scratch")
    try:
        # NOTE(review): `to_cluster` and `statement` are never passed
        # explicitly; P.run() presumably picks them up (and the names the
        # statement interpolates) from this function's local scope, so
        # they must keep exactly these names - confirm against P.
        to_cluster = False
        statement = ("gunzip -c %(maf_file)s > %(maf_tmp)s")
        P.run()

        target_genome = PARAMS["genome"]
        query_genome = PARAMS["phyloCSF_query_genome"]

        genome_file = os.path.join(PARAMS["genomedir"], PARAMS["genome"])

        gene_models = PipelineLncRNA.extractMAFGeneBlocks(bed_file,
                                                          maf_tmp,
                                                          genome_file,
                                                          outfile,
                                                          target_genome,
                                                          query_genome,
                                                          keep_gaps=False)
        E.info("%i gene_models extracted" % gene_models)
    finally:
        # do not leave the decompressed MAF behind on failure
        os.unlink(maf_tmp)

Beispiel #15

Datei anzeigen

Datei: pipeline_rnaseqlncrna.py Projekt: Charlie-George/cgat

def convertControlGTFToBed12(infile, outfile):
    """
    Convert either the ensembl lincRNA or the control gtf to bed12 format.

    Thin wrapper around ``PipelineLncRNA.gtfToBed12``; the literal
    "transcript" is passed as its third argument (presumably the
    feature level to build bed12 entries from - confirm against
    PipelineLncRNA).
    """
    PipelineLncRNA.gtfToBed12(infile, outfile, "transcript")

Beispiel #16

Datei anzeigen

Datei: pipeline_rnaseqlncrna.py Projekt: Charlie-George/cgat

def convertGTFToBed12(infile, outfile):
    """
    Transform the lncrna_final.gtf.gz into lncrna_final.bed.

    Thin wrapper around ``PipelineLncRNA.gtfToBed12``; the literal
    "transcript" is passed as its third argument (presumably the
    feature level to build bed12 entries from - confirm against
    PipelineLncRNA).
    """
    PipelineLncRNA.gtfToBed12(infile, outfile, "transcript")

Beispiel #17

Datei anzeigen

def loadLncRNAPhyloCSF(infile, outfile):
    """
    Parse phyloCSF output and load it via ``P.load``.

    ``PipelineLncRNA.parsePhyloCSF`` writes its result for ``infile``
    into a temporary file under /ifs/scratch, which is then loaded with
    ``P.load`` using the option "--add-index=gene_id".

    The temporary file is removed afterwards, also when parsing or
    loading fails.
    """
    # imported locally so the cleanup does not rely on a module-level import
    import os

    tmpf = P.getTempFilename("/ifs/scratch")
    try:
        PipelineLncRNA.parsePhyloCSF(infile, tmpf)
        P.load(tmpf, outfile, options="--add-index=gene_id")
    finally:
        # NOTE(review): assumes P.load has finished reading tmpf by the
        # time it returns - confirm against P.
        if os.path.exists(tmpf):
            os.unlink(tmpf)

Beispiel #18

Datei anzeigen

def convertControlGTFToBed12(infile, outfile):
    """
    Convert either the ensembl lincRNA or the control gtf to bed12 format.

    Thin wrapper around ``PipelineLncRNA.gtfToBed12``; the literal
    "transcript" is passed as its third argument (presumably the
    feature level to build bed12 entries from - confirm against
    PipelineLncRNA).
    """
    PipelineLncRNA.gtfToBed12(infile, outfile, "transcript")

Beispiel #19

Datei anzeigen

Datei: Summary.py Projekt: santayana/cgat

    def __call__(self, track, slice=None):
        """
        Return single- and multi-exon gene counts for ``track``.

        The counts are taken from the file gtfs/<track>.gtf.gz and
        returned as an ordered mapping with the keys "single_exon" and
        "multi_exon". ``slice`` is accepted but not used.
        """
        gtf_path = os.path.join("gtfs", track) + ".gtf.gz"

        single_exon = PipelineLncRNA.CounterSingleExonGenes(gtf_path).count()
        multi_exon = PipelineLncRNA.CounterMultiExonGenes(gtf_path).count()

        return odict((("single_exon", single_exon),
                      ("multi_exon", multi_exon)))

Beispiel #20

Datei anzeigen

Datei: pipeline_rnaseqlncrna.py Projekt: Charlie-George/cgat

def buildRefcodingGeneSet(infiles, outfile):
    '''
    Build a refcoding geneset based on the genes that are present in
    the abinitio assembly.

    Note the argument order: the second entry of ``infiles`` is passed
    to ``PipelineLncRNA.buildRefcodingGeneSet`` first, followed by the
    first entry and then ``outfile``.
    '''
    leading_arg = infiles[1]
    trailing_arg = infiles[0]
    PipelineLncRNA.buildRefcodingGeneSet(leading_arg,
                                         trailing_arg,
                                         outfile)

Beispiel #21

Datei anzeigen

def convertGTFToBed12(infile, outfile):
    """
    Transform the lncrna_final.gtf.gz into lncrna_final.bed.

    Thin wrapper around ``PipelineLncRNA.gtfToBed12``; the literal
    "transcript" is passed as its third argument (presumably the
    feature level to build bed12 entries from - confirm against
    PipelineLncRNA).
    """
    PipelineLncRNA.gtfToBed12(infile, outfile, "transcript")

Beispiel #22

Datei anzeigen

Datei: pipeline_rnaseqlncrna.py Projekt: nishantthakur/cgat

def flagExonStatus(infile, outfile):
    """
    Flag each gtf entry as multi- or single exon.

    Thin wrapper that delegates to
    ``PipelineLncRNA.flagExonStatus(infile, outfile)``. Per the original
    notes, an attribute is added to the gtf entry dependent on whether
    the lncRNA is multi or single exon.
    """
    PipelineLncRNA.flagExonStatus(infile, outfile)

Beispiel #23

Datei anzeigen

Datei: pipeline_rnaseqlncrna.py Projekt: nishantthakur/cgat

def classifyFilteredLncRNA(infiles, outfile):
    """
    Classify all lincRNA before cpc filtering.

    This defines any classes that are represented in the coding set
    that is filtered out. The first two entries of ``infiles`` and
    ``outfile`` are handed to ``PipelineLncRNA.classifyLncRNAGenes``
    with the distance threshold from ``PARAMS["lncrna_dist"]``.
    """
    first_input = infiles[0]
    second_input = infiles[1]
    max_dist = PARAMS["lncrna_dist"]

    PipelineLncRNA.classifyLncRNAGenes(
        first_input,
        second_input,
        outfile,
        dist=max_dist)

Beispiel #24

Datei anzeigen

Datei: pipeline_rnaseqlncrna.py Projekt: Charlie-George/cgat

def loadLncRNAPhyloCSF(infile, outfile):
    """
    Parse phyloCSF output and load it via ``P.load``.

    ``PipelineLncRNA.parsePhyloCSF`` writes its result for ``infile``
    into a temporary file under /ifs/scratch, which is then loaded with
    ``P.load`` using the option "--index=gene_id".

    The temporary file is removed afterwards, also when parsing or
    loading fails.
    """
    # imported locally so the cleanup does not rely on a module-level import
    import os

    tmpf = P.getTempFilename("/ifs/scratch")
    try:
        PipelineLncRNA.parsePhyloCSF(infile, tmpf)
        P.load(tmpf, outfile, options="--index=gene_id")
    finally:
        # NOTE(review): assumes P.load has finished reading tmpf by the
        # time it returns - confirm against P.
        if os.path.exists(tmpf):
            os.unlink(tmpf)

Beispiel #25

Datei anzeigen

def buildRefcodingGeneSet(infiles, outfile):
    '''
    Build a refcoding geneset based on the genes that are present in
    the abinitio assembly.

    Note the argument order: the second entry of ``infiles`` is passed
    to ``PipelineLncRNA.buildRefcodingGeneSet`` first, followed by the
    first entry and then ``outfile``.
    '''
    leading_arg = infiles[1]
    trailing_arg = infiles[0]
    PipelineLncRNA.buildRefcodingGeneSet(leading_arg,
                                         trailing_arg,
                                         outfile)