Exemple #1
0
def classifyLncRNA(infiles, outfile):
    '''
    Classify lncRNA relative to protein coding loci.

    The first two entries of ``infiles`` are handed, together with
    ``outfile`` and the distance setting ``PARAMS["lncrna_dist"]``, to
    ``PipelineLncRNA.classifyLncRNAGenes``.  Interval indices are created
    on the fly (original author's note: maybe additional annotations
    should be created instead).

    Classes assigned (the 2kb quoted below presumably corresponds to the
    configured ``lncrna_dist`` - confirm against the configuration):

    antisense
       transcript overlapping protein coding exons on opposite strand
    antisense_upstream
       transcript < 2kb from tss on opposite strand
    antisense_downstream
       transcript < 2kb from gene end on opposite strand
    sense_upstream
       transcript < 2kb from tss on same strand
    sense_downstream
       transcript < 2kb from gene end on same strand
    intergenic
       transcript > 2kb from any protein coding gene
    intronic
       overlaps protein coding gene intron on same strand
    antisense_intronic
       overlaps protein coding intron on opposite strand
    '''
    classify = PipelineLncRNA.classifyLncRNAGenes
    first_input, second_input = infiles[0], infiles[1]
    max_distance = PARAMS["lncrna_dist"]
    classify(first_input, second_input, outfile, dist=max_distance)
Exemple #2
0
def flagExonStatus(infile, outfile):
    '''
    Annotate gtf entries with their exon status.

    Two attributes are added to each gtf entry:

    exon_status_locus
       whether the gene model is multi- or single-exon
    exon_status
       whether the transcript is multi- or single-exon

    The work is delegated to ``PipelineLncRNA.flagExonStatus``.
    '''
    annotate = PipelineLncRNA.flagExonStatus
    annotate(infile, outfile)
Exemple #3
0
def classifyFilteredLncRNA(infiles, outfile):
    '''
    Classify all lincRNA before cpc filtering.

    This makes it possible to see which classes are represented in the
    coding set that gets filtered out.  The first two entries of
    ``infiles`` are passed to ``PipelineLncRNA.classifyLncRNAGenes`` with
    the distance setting ``PARAMS["lncrna_dist"]``.

    NOTE: This task is not included when running the full pipeline.
    '''
    classify = PipelineLncRNA.classifyLncRNAGenes
    first_input, second_input = infiles[0], infiles[1]
    max_distance = PARAMS["lncrna_dist"]
    classify(first_input, second_input, outfile, dist=max_distance)
Exemple #4
0
def splitLncRNAFasta(infile, outfiles):
    '''
    Split an aligned fasta file, renaming species headers on the way.

    ``PARAMS["phyloCSF_map_species_names"]`` is a comma-separated list of
    ``old:new`` pairs.  Each pair becomes a ``">old" -> ">new"`` entry in
    the mapping handed to ``PipelineLncRNA.splitAlignedFasta`` along with
    ``infile`` and the fixed output directory.

    ``outfiles`` is not used inside this function.
    '''
    species_pairs = (
        entry.split(":")
        for entry in PARAMS["phyloCSF_map_species_names"].split(","))
    # keys and values carry the leading ">" of a fasta header line
    header_map = {">" + pair[0]: ">" + pair[1] for pair in species_pairs}
    E.info("Name mapping: %s" % header_map)

    PipelineLncRNA.splitAlignedFasta(infile,
                                     "./phyloCSF/lncrna_fasta",
                                     header_map)
Exemple #5
0
def buildRefnoncodingGeneSet(infile, outfile):
    '''
    Filter the refnoncoding geneset.

    Targets entries that ensembl describes as one of:

    - Ambiguous_orf
    - Retained_intron
    - Sense_intronic
    - antisense
    - Sense_overlapping
    - Processed transcript

    The filtering itself is done by
    ``PipelineLncRNA.buildRefnoncodingGeneSet``.
    '''
    build = PipelineLncRNA.buildRefnoncodingGeneSet
    build(infile, outfile)
Exemple #6
0
 def buildFilteredLncRNAGeneSet(infiles, outfile):
     '''
     Create a filtered lncRNA geneset.

     The set contains previously identified gene models supplied in
     contig file.  ``infiles[0]`` is passed as the first argument and all
     remaining entries of ``infiles`` as the third argument of
     ``PipelineLncRNA.buildFilteredLncRNAGeneSet``.

     ``PARAMS["filtering_remove_single_exon"]`` must be "loci",
     "transcripts" or None; it is forwarded as ``filter_se``.
     '''
     single_exon_mode = PARAMS["filtering_remove_single_exon"]
     assert single_exon_mode in ("loci", "transcripts", None)

     build = PipelineLncRNA.buildFilteredLncRNAGeneSet
     primary_input = infiles[0]
     remaining_inputs = infiles[1:]
     build(primary_input,
           outfile,
           remaining_inputs,
           filter_se=PARAMS["filtering_remove_single_exon"])
Exemple #7
0
def buildLncRNAGeneSetStats(infile, outfile):
    '''
    Write summary counts for a geneset to a tab-separated file.

    ``outfile`` receives a header line followed by one line of values
    (no trailing newline after the values), in this column order:

    no_transcripts, no_genes, no_exons_per_transcript,
    no_exons_per_gene, no_single_exon_transcripts,
    no_multi_exon_transcripts, no_single_exon_genes,
    no_multi_exon_genes

    Each value is the ``count()`` of the matching ``PipelineLncRNA``
    counter class instantiated on ``infile``.

    The output file is written inside a ``with`` block so the handle is
    closed (and flushed) even when one of the counters raises.
    '''
    columns = [
        "no_transcripts", "no_genes", "no_exons_per_transcript",
        "no_exons_per_gene", "no_single_exon_transcripts",
        "no_multi_exon_transcripts", "no_single_exon_genes",
        "no_multi_exon_genes"
    ]

    with open(outfile, "w") as outf:
        outf.write("\t".join(columns) + "\n")

        # counter order must match the column order above
        counts = [
            PipelineLncRNA.CounterTranscripts(infile).count(),
            PipelineLncRNA.CounterGenes(infile).count(),
            PipelineLncRNA.CounterExonsPerTranscript(infile).count(),
            PipelineLncRNA.CounterExonsPerGene(infile).count(),
            PipelineLncRNA.CounterSingleExonTranscripts(infile).count(),
            PipelineLncRNA.CounterMultiExonTranscripts(infile).count(),
            PipelineLncRNA.CounterSingleExonGenes(infile).count(),
            PipelineLncRNA.CounterMultiExonGenes(infile).count()
        ]
        outf.write("\t".join(str(value) for value in counts))
Exemple #8
0
def buildFinalLncRNAGeneSet(infile, outfile):
    '''
    Build the final lncRNA gene set.

    The final set consists of transcripts that passed the initial
    filtering stage, i.e. they are

    - multi-exonic, or previously seen single exon transcripts
    - displaying low evidence for coding potential

    Filtering on coding potential and renaming are performed by
    ``PipelineLncRNA.buildFinalLncRNAGeneSet``, driven by the
    ``filtering_cpc``, ``filtering_cpc_threshold`` and
    ``final_geneset_rename`` settings.
    '''
    build = PipelineLncRNA.buildFinalLncRNAGeneSet
    # settings are forwarded positionally, in exactly this order
    settings = [
        PARAMS[key]
        for key in ("filtering_cpc",
                    "filtering_cpc_threshold",
                    "final_geneset_rename")
    ]
    build(infile, "lncrna_filtered_cpc_result", outfile, *settings)
Exemple #9
0
def buildLncRNAGeneSet(infiles, outfile):
    '''
    Build the lncRNA gene set.

    This is the set of transcripts in the abinitio set that do not
    overlap any protein coding or pseudogene transcripts (exons), or
    additional unwanted ensembl biotypes, in a reference gene set.

    Transcripts need to have a length of at least 200 bp; the actual
    minimum passed on is ``PARAMS["lncrna_min_length"]``.

    The first five entries of ``infiles`` are forwarded in order to
    ``PipelineLncRNA.buildLncRNAGeneSet``.
    '''
    build = PipelineLncRNA.buildLncRNAGeneSet
    leading_inputs = [infiles[position] for position in range(5)]
    min_length = PARAMS["lncrna_min_length"]
    build(*leading_inputs, outfile, min_length)
Exemple #10
0
def extractLncRNAFastaAlignments(infiles, outfile):
    """
    Extract aligned sequence for each interval into a single fasta file.

    Receives a pair ``(bed_file, maf_file)``: a gzip-compressed MAF file
    containing pairwise alignments and a file containing intervals (the
    original description calls it "gtf12"; the variable name suggests
    bed12 - confirm).  The MAF file is decompressed to a temporary file
    under ``./phyloCSF`` which is handed to
    ``PipelineLncRNA.extractMAFGeneBlocks`` together with the genome
    settings from ``PARAMS``.

    The temporary file is removed in a ``finally`` clause, so it no
    longer lingers when decompression or extraction fails.
    """
    bed_file, maf_file = infiles
    maf_tmp = P.getTempFilename("./phyloCSF")
    try:
        # NOTE: `to_cluster`, `statement`, `maf_file` and `maf_tmp` look
        # unused, but the argument-less P.run() presumably picks them up
        # from this function's local variables - do not rename or remove.
        to_cluster = False
        statement = ("gunzip -c %(maf_file)s > %(maf_tmp)s")
        P.run()

        target_genome = PARAMS["genome"]
        query_genome = PARAMS["phyloCSF_query_genome"]
        genome_file = os.path.join(PARAMS["genomedir"], target_genome)

        gene_models = PipelineLncRNA.extractMAFGeneBlocks(bed_file,
                                                          maf_tmp,
                                                          genome_file,
                                                          outfile,
                                                          target_genome,
                                                          query_genome,
                                                          keep_gaps=False)
        E.info("%i gene_models extracted" % gene_models)
    finally:
        # guard with exists() so a cleanup problem never masks the
        # original error
        if os.path.exists(maf_tmp):
            os.unlink(maf_tmp)
Exemple #11
0
def createMAFAlignment(infiles, outfile):
    """
    Build a filtered, gzip-compressed MAF alignment from .axt files.

    Takes all files ending in "net.axt.gz" in the directory
    PARAMS["phyloCSF_location_axt"], drops those whose name matches the
    regular expression PARAMS["phyloCSF_ignore"], concatenates the rest
    and converts them to a single maf file using axtToMaf, then filters
    the maf alignments with PipelineLncRNA.filterMAF using
    PARAMS["phyloCSF_filter_alignments"].

    Both the filtered alignment (outfile) and the removed blocks
    (<outfile stem>_removed.maf) are gzipped at the end.

    `infiles` is not used inside this function.

    NOTE(review): the argument-less P.run() calls presumably read
    `statement`, `to_cluster` and the %(name)s placeholders from this
    function's local variables, so local names and statement order must
    not be changed - confirm against the Pipeline module.
    """
    # work with the uncompressed file name; the final gzip call below
    # compresses the file again
    outfile = P.snip(outfile, ".gz")
    axt_dir = PARAMS["phyloCSF_location_axt"]
    to_ignore = re.compile(PARAMS["phyloCSF_ignore"])

    # collect the net.axt.gz files that are not excluded by the regex
    axt_files = []
    for axt_file in os.listdir(axt_dir):
        if axt_file.endswith("net.axt.gz") and not to_ignore.search(axt_file):
            axt_files.append(os.path.join(axt_dir, axt_file))
    # from here on axt_files is a space-separated string (sorted paths)
    # suitable for the shell command line below
    axt_files = (" ").join(sorted(axt_files))

    E.info("axt files from which MAF alignment will be created: %s" %
           axt_files)

    target_genome = PARAMS["phyloCSF_target_genome"]
    target_contigs = os.path.join(PARAMS["annotations_dir"],
                                  PARAMS["annotations_interface_contigs"])
    query_genome = PARAMS["phyloCSF_query_genome"]
    query_contigs = os.path.join(PARAMS["phyloCSF_query_assembly"],
                                 PARAMS["annotations_interface_contigs"])

    # tmpf1 holds the concatenated axt data, tmpf2 the unfiltered maf
    tmpf1 = P.getTempFilename("./phyloCSF")
    tmpf2 = P.getTempFilename("./phyloCSF")
    to_cluster = False
    # decompress and concatenate the axt files into tmpf1, then convert
    # tmpf1 to MAF (tmpf2) with axtToMaf
    statement = ("zcat %(axt_files)s"
                 " > %(tmpf1)s;"
                 " axtToMaf "
                 "  -tPrefix=%(target_genome)s."
                 "  -qPrefix=%(query_genome)s."
                 "  %(tmpf1)s"
                 "  %(target_contigs)s"
                 "  %(query_contigs)s"
                 "  %(tmpf2)s")
    P.run()

    E.info("Temporary axt file created %s" % os.path.abspath(tmpf1))
    E.info("Temporary maf file created %s" % os.path.abspath(tmpf2))

    # blocks rejected by the filter are written to a sibling file
    removed = P.snip(outfile, ".maf") + "_removed.maf"
    to_cluster = False
    # per the log messages below: filtered[0] is the number of ignored
    # blocks, filtered[1] the number of blocks written to the output
    filtered = PipelineLncRNA.filterMAF(tmpf2, outfile, removed,
                                        PARAMS["phyloCSF_filter_alignments"])
    E.info("%s blocks were ignored in MAF alignment"
           " because length of target alignment was too short" % filtered[0])
    E.info("%s blocks were output to filtered MAF alignment" % filtered[1])

    # temporary files are only removed when every step above succeeded
    os.unlink(tmpf1)
    os.unlink(tmpf2)
    to_cluster = False
    # compress both the filtered alignment and the removed blocks
    statement = ("gzip %(outfile)s;" " gzip %(removed)s")
    P.run()
Exemple #12
0
def buildCodingGeneSet(infiles, outfile):
    '''
    Filter a cuffcompare-processed transcript assembly for annotated
    protein coding genes.

    NB "pruned" refers to nomenclature in the transcript building
    pipeline - transcripts that appear in at least two samples.

    An abinitio assembly will often contain fragments of known
    transcripts and describe them as novel.  The default behaviour is
    therefore to produce a set composed of 'complete' or 'contained'
    transcripts, i.e. nothing novel.  This may underestimate the number
    of transcripts that are actually expressed.

    The first two entries of ``infiles`` are forwarded, in order, to
    ``PipelineLncRNA.buildCodingGeneSet``.
    '''
    build = PipelineLncRNA.buildCodingGeneSet
    first_input, second_input = infiles[0], infiles[1]
    build(first_input, second_input, outfile)
Exemple #13
0
def buildRefcodingGeneSetStats(infile, outfile):
    '''
    Write summary counts for the refcoding geneset.

    The exon status of ``infile`` is first flagged into a temporary
    gzipped file with ``PipelineLncRNA.flagExonStatus``; all counts are
    computed on that temporary file.

    ``outfile`` receives a tab-separated header line followed by one
    line of values (no trailing newline), in this column order:

    no_transcripts, no_genes, no_exons_per_transcript,
    no_exons_per_gene, no_single_exon_transcripts,
    no_multi_exon_transcripts, no_single_exon_genes,
    no_multi_exon_genes

    The output handle is closed explicitly so the data is flushed and
    the descriptor released even when one of the counters raises.
    '''

    # calculate exon status for refcoding genes.
    tmpf = P.getTempFilename(".") + ".gz"
    PipelineLncRNA.flagExonStatus(infile, tmpf)

    columns = [
        "no_transcripts", "no_genes", "no_exons_per_transcript",
        "no_exons_per_gene", "no_single_exon_transcripts",
        "no_multi_exon_transcripts", "no_single_exon_genes",
        "no_multi_exon_genes"
    ]

    outf = iotools.openFile(outfile, "w")
    try:
        outf.write("\t".join(columns) + "\n")

        # counter order must match the column order above
        counts = [
            PipelineLncRNA.CounterTranscripts(tmpf).count(),
            PipelineLncRNA.CounterGenes(tmpf).count(),
            PipelineLncRNA.CounterExonsPerTranscript(tmpf).count(),
            PipelineLncRNA.CounterExonsPerGene(tmpf).count(),
            PipelineLncRNA.CounterSingleExonTranscripts(tmpf).count(),
            PipelineLncRNA.CounterMultiExonTranscripts(tmpf).count(),
            PipelineLncRNA.CounterSingleExonGenes(tmpf).count(),
            PipelineLncRNA.CounterMultiExonGenes(tmpf).count()
        ]
        outf.write("\t".join(str(value) for value in counts))
    finally:
        outf.close()

    # remove the flagged temporary file, its ".log" companion
    # (presumably written by flagExonStatus - confirm) and the name
    # without ".gz" (presumably the file created by getTempFilename)
    os.unlink(tmpf)
    os.unlink(tmpf + ".log")
    os.unlink(P.snip(tmpf, ".gz"))
Exemple #14
0
def extractControllLncRNAFastaAlignments(infiles, outfile):
    """
    Extract aligned sequence for control intervals into a fasta file.

    ``infiles`` is a pair ``(bed_file, maf_file)``.  The gzip-compressed
    MAF file is decompressed to a temporary file below ``/ifs/scratch``
    which is handed to ``PipelineLncRNA.extractMAFGeneBlocks`` together
    with the interval file and the genome settings from ``PARAMS``.

    The temporary file is removed in a ``finally`` clause, so it no
    longer lingers when decompression or extraction fails.
    """
    bed_file, maf_file = infiles
    # NOTE(review): the scratch location is hard-coded
    maf_tmp = P.getTempFilename("/ifs/scratch")
    try:
        # NOTE: `to_cluster`, `statement`, `maf_file` and `maf_tmp` look
        # unused, but the argument-less P.run() presumably picks them up
        # from this function's local variables - do not rename or remove.
        to_cluster = False
        statement = ("gunzip -c %(maf_file)s > %(maf_tmp)s")
        P.run()

        target_genome = PARAMS["genome"]
        query_genome = PARAMS["phyloCSF_query_genome"]
        genome_file = os.path.join(PARAMS["genomedir"], target_genome)

        gene_models = PipelineLncRNA.extractMAFGeneBlocks(bed_file,
                                                          maf_tmp,
                                                          genome_file,
                                                          outfile,
                                                          target_genome,
                                                          query_genome,
                                                          keep_gaps=False)
        E.info("%i gene_models extracted" % gene_models)
    finally:
        # guard with exists() so a cleanup problem never masks the
        # original error
        if os.path.exists(maf_tmp):
            os.unlink(maf_tmp)
Exemple #15
0
def convertControlGTFToBed12(infile, outfile):
    """
    Turn a gtf file into bed12 format.

    Used for either the ensembl lincRNA or the control gtf.  Conversion
    is delegated to ``PipelineLncRNA.gtfToBed12`` with "transcript" as
    its third argument.
    """
    convert = PipelineLncRNA.gtfToBed12
    level = "transcript"
    convert(infile, outfile, level)
Exemple #16
0
def buildRefcodingGeneSet(infiles, outfile):
    '''
    Build a refcoding geneset based on the genes that are present in
    the abinitio assembly.

    Note the argument order: ``infiles[1]`` is passed first and
    ``infiles[0]`` second to ``PipelineLncRNA.buildRefcodingGeneSet``.
    '''
    build = PipelineLncRNA.buildRefcodingGeneSet
    leading_arg = infiles[1]
    following_arg = infiles[0]
    build(leading_arg, following_arg, outfile)
Exemple #17
0
def loadLncRNAPhyloCSF(infile, outfile):
    """
    Parse phyloCSF output and load it with an index on gene_id.

    ``infile`` is parsed by ``PipelineLncRNA.parsePhyloCSF`` into a
    temporary file below ``/ifs/scratch``; that file is then loaded via
    ``P.load`` with ``--add-index=gene_id``.

    The temporary file is removed afterwards (previously it was left
    behind on every run).
    """
    # function-scope import: this block did not depend on os before
    import os

    tmpf = P.getTempFilename("/ifs/scratch")
    try:
        PipelineLncRNA.parsePhyloCSF(infile, tmpf)
        P.load(tmpf, outfile, options="--add-index=gene_id")
    finally:
        # assumes P.load has finished reading tmpf once it returns -
        # TODO confirm against the Pipeline module
        if os.path.exists(tmpf):
            os.unlink(tmpf)
Exemple #18
0
def convertGTFToBed12(infile, outfile):
    """
    Turn lncrna_final.gtf.gz into lncrna_final.bed.

    Conversion is delegated to ``PipelineLncRNA.gtfToBed12`` with
    "transcript" as its third argument.
    """
    convert = PipelineLncRNA.gtfToBed12
    level = "transcript"
    convert(infile, outfile, level)