Esempio n. 1
0
def Cufflinks_mask_GTF(assembly):
    """Build GTF lines for tRNA/rRNA repeats plus a whole-chrM mask.

    The repeat-masker rows are gathered from one "<chrom>_rmsk" table per
    chromosome listed in "chromInfo" and from the genome-wide "rmsk"
    table.  Rows whose field 11 is "tRNA" or "rRNA" are kept, except
    those located on chrM; chrM is instead covered by two synthetic
    records (one per strand) spanning its full length as reported by
    chromInfo.  Presumably the result is meant to be used as a Cufflinks
    mask file -- confirm against the caller.

    Args:
        assembly: assembly identifier passed through to
            kent.fetch_ucsc_table.

    Returns:
        A list of tab-separated GTF lines (no trailing newline), each
        with feature "exon", source "rmsk" and a unique MASKnnnnnn
        gene_id / transcript_id.

    Raises:
        IndexError: if a non-blank repeat-masker row has fewer than 12
            tab-separated fields.
    """
    chromInfo = kent.fetch_ucsc_table(assembly, "chromInfo")

    # None means "chromInfo did not list chrM"; in that case no synthetic
    # chrM record is emitted (a zero-length "chrM 0 0" mask is useless).
    chrM_size = None
    rmsk = []

    for line in chromInfo:
        # Skip blank lines and '#'-prefixed header/comment lines so they
        # are never mistaken for chromosome names.
        if not line.strip() or line.startswith("#"):
            continue
        line_split = line.strip().split("\t")

        if line_split[0] == "chrM":
            chrM_size = int(line_split[1])

        # Extending one flat list is linear; the former sum(lists, [])
        # re-copied the accumulated rows for every table.
        rmsk.extend(kent.fetch_ucsc_table(assembly, line_split[0] + "_rmsk"))

    rmsk.extend(kent.fetch_ucsc_table(assembly, "rmsk"))

    if chrM_size is not None:
        # Synthetic rows laid out like repeat-masker rows.  "real_chrM"
        # lets them survive the chrM filter below before being renamed.
        for strand in ("+", "-"):
            rmsk.append("\t".join(["."] * 5 +
                                  ["real_chrM", "0", str(chrM_size),
                                   ".", strand, ".", "rRNA"]))

    output = []
    maskid = 0

    for line in rmsk:
        if not line.strip():
            continue
        line_split = line.strip().split("\t")

        if line_split[11] not in ("tRNA", "rRNA"):
            continue
        if line_split[5] == "chrM":
            # Individual chrM repeats are dropped; the synthetic
            # whole-chromosome records replace them.
            continue
        if line_split[5] == "real_chrM":
            line_split[5] = "chrM"

        maskid += 1

        # NOTE(review): fields 6/7 are copied verbatim into the GTF
        # start/end columns.  UCSC tables are presumably 0-based
        # half-open while GTF is 1-based -- confirm whether start needs +1.
        GTF_line = [line_split[5],
                    "rmsk",
                    "exon",
                    line_split[6],
                    line_split[7],
                    "0.000000",
                    line_split[9],
                    ".",
                    "gene_id \"MASK%06d\"; transcript_id \"MASK%06d\";" % (maskid, maskid)]

        output.append("\t".join(GTF_line))

    return output
Esempio n. 2
0
def Cufflinks_mask_GTF(assembly):
    """Return GTF mask lines covering tRNA/rRNA repeats and all of chrM.

    Repeat-masker rows are collected from every "<chrom>_rmsk" table
    (one per chromosome named in "chromInfo") and from the genome-wide
    "rmsk" table.  Only rows whose field 11 equals "tRNA" or "rRNA" are
    converted; rows on chrM are discarded and replaced by two synthetic
    full-length chrM records, one per strand.  Presumably the output is
    consumed as a Cufflinks mask file -- confirm against the caller.

    Args:
        assembly: assembly identifier forwarded to kent.fetch_ucsc_table.

    Returns:
        A list of tab-separated GTF lines without line terminators; each
        carries a unique MASKnnnnnn gene_id / transcript_id.

    Raises:
        IndexError: if a non-blank repeat-masker row has fewer than 12
            tab-separated fields.
    """
    chromInfo = kent.fetch_ucsc_table(assembly, "chromInfo")

    # Stays None when chromInfo has no chrM row, in which case the
    # synthetic chrM records are omitted rather than emitted as "0 0".
    chrM_size = None
    rmsk = []

    for line in chromInfo:
        # Blank lines and '#'-prefixed header/comment lines carry no
        # chromosome name and must not trigger a table fetch.
        if not line.strip() or line.startswith("#"):
            continue
        line_split = line.strip().split("\t")

        if line_split[0] == "chrM":
            chrM_size = int(line_split[1])

        # Grow a single list in place (linear) instead of concatenating
        # all fetched tables with sum(..., []) (quadratic).
        rmsk.extend(
            kent.fetch_ucsc_table(assembly, line_split[0] + "_rmsk"))

    rmsk.extend(kent.fetch_ucsc_table(assembly, "rmsk"))

    if chrM_size is not None:
        # Placeholder rows shaped like repeat-masker rows; the temporary
        # "real_chrM" name keeps them from being filtered out as chrM.
        for strand in ("+", "-"):
            rmsk.append("\t".join(["."] * 5 + [
                "real_chrM", "0", str(chrM_size), ".", strand, ".", "rRNA"
            ]))

    output = []
    maskid = 0

    for line in rmsk:
        if not line.strip():
            continue
        line_split = line.strip().split("\t")

        if line_split[11] not in ("tRNA", "rRNA"):
            continue
        if line_split[5] == "chrM":
            # Per-repeat chrM rows are superseded by the synthetic ones.
            continue
        if line_split[5] == "real_chrM":
            line_split[5] = "chrM"

        maskid += 1

        # NOTE(review): start/end (fields 6/7) are emitted unchanged.
        # UCSC tables are presumably 0-based half-open whereas GTF is
        # 1-based -- confirm whether the start should be shifted by one.
        GTF_line = [
            line_split[5], "rmsk", "exon", line_split[6], line_split[7],
            "0.000000", line_split[9], ".",
            "gene_id \"MASK%06d\"; transcript_id \"MASK%06d\";" %
            (maskid, maskid)
        ]

        output.append("\t".join(GTF_line))

    return output
Esempio n. 3
0
def Cufflinks_knownGene_GTF(assembly):
    """Return knownGene GTF lines with rewritten attribute columns.

    The ninth (attribute) column of every knownGene GTF line is replaced
    by: a gene_id derived from the knownIsoforms cluster number
    ("CLUSTnnnnn"), the original id as transcript_id, the kgXref field 4
    as gene_name, the kgXref field 1 as transcript_name and, only when
    it is non-empty, the kgXref field 6 as protein_id.

    Args:
        assembly: assembly identifier forwarded to the kent fetch helpers.

    Returns:
        A list of tab-separated GTF lines without line terminators.

    Raises:
        KeyError: if a knownGene id is absent from kgXref or
            knownIsoforms.
        IndexError: if a kgXref row has fewer than 7 columns, a
            knownIsoforms row fewer than 2, or a GTF line lacks a quoted
            id in its attribute column.
    """
    kgXref = kent.fetch_ucsc_table(assembly, "kgXref")
    knownIsoforms = kent.fetch_ucsc_table(assembly, "knownIsoforms")
    knownGene = kent.fetch_ucsc_gtf(assembly, "knownGene")

    ucscid_to_xref = {}

    for line in kgXref:
        if not line.strip() or line.startswith("#"):
            continue
        # Strip only spaces and line terminators: a bare strip() also
        # eats trailing tabs, which removes empty trailing columns and
        # makes field 6 unreachable when it and all later fields are empty.
        line_split = line.strip(" \r\n").split("\t")
        ucscid_to_xref[line_split[0]] = {
            "mRNA": line_split[1],
            "gene_symbol": line_split[4],
            "protein_id": line_split[6]
        }

    ucscid_to_clusterid = {}

    for line in knownIsoforms:
        if not line.strip() or line.startswith("#"):
            continue
        line_split = line.strip().split("\t")
        ucscid_to_clusterid[line_split[1]] = int(line_split[0])

    output = []

    for line in knownGene:
        if not line.strip():
            continue
        # Split at most 8 times so the attribute column stays intact.
        line_split = line.strip().split("\t", 8)

        # The id is the first double-quoted value in the attributes.
        ucscid = line_split[8].split("\"")[1]
        xref = ucscid_to_xref[ucscid]
        clusterid = ucscid_to_clusterid[ucscid]

        attributes = [
            "gene_id \"CLUST%05d\";" % clusterid,
            "transcript_id \"%s\";" % ucscid,
            "gene_name \"%s\";" % xref["gene_symbol"],
            "transcript_name \"%s\";" % xref["mRNA"]
        ]
        # Append protein_id only when present; joining an empty
        # placeholder used to leave a trailing space on the line.
        if xref["protein_id"] != "":
            attributes.append("protein_id \"%s\";" % xref["protein_id"])

        output.append("\t".join(line_split[:8]) + "\t" + " ".join(attributes))

    return output
Esempio n. 4
0
def Cufflinks_knownGene_GTF(assembly):
    """Build knownGene GTF lines annotated with cluster and xref data.

    For each knownGene GTF line the attribute column is rebuilt from:
    the knownIsoforms cluster number (gene_id "CLUSTnnnnn"), the
    original id (transcript_id), kgXref field 4 (gene_name), kgXref
    field 1 (transcript_name) and kgXref field 6 (protein_id, emitted
    only when non-empty).

    Args:
        assembly: assembly identifier passed to the kent fetch helpers.

    Returns:
        A list of tab-separated GTF lines without line terminators.

    Raises:
        KeyError: if a knownGene id is missing from kgXref or
            knownIsoforms.
        IndexError: if a kgXref row has fewer than 7 columns, a
            knownIsoforms row fewer than 2, or a GTF line has no quoted
            id in its attribute column.
    """
    kgXref = kent.fetch_ucsc_table(assembly, "kgXref")
    knownIsoforms = kent.fetch_ucsc_table(assembly, "knownIsoforms")
    knownGene = kent.fetch_ucsc_gtf(assembly, "knownGene")

    ucscid_to_xref = {}

    for line in kgXref:
        if not line.strip() or line.startswith("#"):
            continue
        # Keep tabs when trimming: strip() without arguments drops
        # trailing tabs and with them any empty trailing columns, so
        # field 6 could go missing for rows ending in empty fields.
        line_split = line.strip(" \r\n").split("\t")
        ucscid_to_xref[line_split[0]] = {"mRNA": line_split[1],
                                         "gene_symbol": line_split[4],
                                         "protein_id": line_split[6]}

    ucscid_to_clusterid = {}

    for line in knownIsoforms:
        if not line.strip() or line.startswith("#"):
            continue
        line_split = line.strip().split("\t")
        ucscid_to_clusterid[line_split[1]] = int(line_split[0])

    output = []

    for line in knownGene:
        if not line.strip():
            continue
        # maxsplit=8 keeps the whole attribute column in one piece.
        line_split = line.strip().split("\t", 8)

        # First double-quoted value of the attribute column is the id.
        ucscid = line_split[8].split("\"")[1]
        xref = ucscid_to_xref[ucscid]
        clusterid = ucscid_to_clusterid[ucscid]

        attributes = ["gene_id \"CLUST%05d\";" % clusterid,
                      "transcript_id \"%s\";" % ucscid,
                      "gene_name \"%s\";" % xref["gene_symbol"],
                      "transcript_name \"%s\";" % xref["mRNA"]]
        # Only add protein_id when there is one, so no empty item is
        # joined in (which previously produced a trailing space).
        if xref["protein_id"] != "":
            attributes.append("protein_id \"%s\";" % xref["protein_id"])

        line = "\t".join(line_split[:8]) + "\t" + " ".join(attributes)

        output.append(line)

    return output