コード例 #1
0
    prefix = args.prefix + "-" if args.prefix else ""
    outfilename = prefix + "cnvs.csv"
    # Prep the output file name: a non-empty prefix is dash-joined to
    # "cnvs.csv"; with no prefix the bare name is used.

    with open(outfilename, "w") as outfile:
        MQ.runTime("# CNV VCF parsing", outfile)
        MQ.PWS("# VCF File:      " + args.input, outfile)
        MQ.PWS("# Samples File:  " + args.samples, outfile)
        # The prefix is optional (a falsy value means "no prefix"), so fall
        # back to "" rather than concatenating None, which raises TypeError.
        MQ.PWS("# Output prefix: " + (args.prefix or ""), outfile)
        MQ.PWS("# Output file:   " + outfilename, outfile)
        MQ.PWS("# ----------", outfile)
        # Run header: echo the inputs and outputs into the output file.

        MQ.PWS("# " + MQ.getDateTime() + " Reading sample info...", outfile)
        samples = cnvlib.readSamples(args.samples)
        MQ.PWS("# " + MQ.getDateTime() + " Samples read: " + str(len(samples)),
               outfile)
        # Read the sample info.

        MQ.PWS("# " + MQ.getDateTime() + " Reading VCF...", outfile)
        vcf, vcf_headers, vcf_format, human_flag = cnvlib.readVCF(args.input)
        MQ.PWS("# " + MQ.getDateTime() + " Variants read: " + str(len(vcf)),
               outfile)
        # Read the VCF file.

        MQ.PWS("# " + MQ.getDateTime() + " Fixing VCF headers...", outfile)
        # Strip the "NHP-" tag from every header, then rename the one header
        # "39239A" to "39239" (only the first occurrence is touched).
        vcf_headers = [h.replace("NHP-", "") for h in vcf_headers]
        if "39239A" in vcf_headers:
            vcf_headers[vcf_headers.index("39239A")] = "39239"
コード例 #2
0
    for io_label, io_path in (
        ("# Gene file:       ", gene_file),
        ("# 10kb up file:    ", gene_up_file),
        ("# 10kb down file:  ", gene_down_file),
        ("# Transcript file: ", transcript_file),
        ("# Exon file:       ", exon_file),
    ):
        MQ.PWS(io_label + io_path, logfile)
    MQ.PWS("# ----------------", logfile)
    # I/O options and info: one aligned log line per annotation input file.

    feature_types = [
        "gene",
        "gene-10kb-up",
        "gene-10kb-down",
        "transcript",
        "exon",
    ]
    cnv_types = [
        "del",
        "dup",
    ]
    overlap_types = [
        "full",
        "partial",
    ]
    # Categories for features, CNVs, and overlaps.

    MQ.PWS("# " + MQ.getDateTime() + " Reading CNVs...", logfile)
    cnvs = []
    # Use a context manager so the CNV file handle is closed as soon as the
    # file has been read instead of lingering until garbage collection.
    with open(cnvs_file) as cnv_stream:
        for line in cnv_stream:
            # The CNV identifier is the 4th tab-separated column; its 4th
            # colon-separated field is parsed as the CNV length.
            cnv = line.strip().split("\t")[3]
            cnv_len = int(cnv.split(":")[3])
            # Keep only CNVs strictly shorter than the configured maximum.
            if cnv_len < max_cnv_len:
                cnvs.append(cnv)
    num_cnvs = len(cnvs)
    MQ.PWS("# CNVs read: " + str(num_cnvs), logfile)

    MQ.PWS("\n# " + MQ.getDateTime() + " Counting gene overlaps...", logfile)
    countOverlaps(cnvs, gene_file, "GENES")

    MQ.PWS(
        "\n# " + MQ.getDateTime() + " Counting 10kb upstream gene overlaps...",
        logfile)
コード例 #3
0
    def _report_io(label, value):
        # Write one header line: the label padded to io_pad, then its value.
        MQ.PWS(MQ.spacedOut(label, io_pad) + value, outfile)

    MQ.runTime("# Fisher's test for GO enrichment", outfile)
    _report_io("# Query file:", queryfile)
    _report_io("# Background file:", bgfile)
    _report_io("# Alpha (p-value threshold):", str(alpha))
    _report_io("# Multiple test correction method:", correction_str)
    if correction == "None":
        MQ.PWS("# --> WARNING: Not correcting for multiple tests!", outfile)
    _report_io("# Output file:", outfilename)
    MQ.PWS("# ----------", outfile)
    # Report I/O information.

    MQ.PWS("# " + MQ.getDateTime() + " Counting total query GO terms...",
           outfile)
    query_genes = []
    # Companion set for O(1) duplicate checks; query_genes stays a list so
    # its first-seen ordering and type are unchanged for later code.
    seen_query_genes = set()
    query_go_count = 0
    # Context manager guarantees the query file is closed after reading.
    with open(queryfile) as query_stream:
        for line in query_stream:
            # Lines beginning with "#" are skipped and not counted.
            if line.startswith("#"):
                continue
            line = line.strip().split("\t")
            gid = line[0]
            if gid not in seen_query_genes:
                seen_query_genes.add(gid)
                query_genes.append(gid)
            # Every non-comment line counts as one query GO term.
            query_go_count += 1
    # Get count of query GO terms and unique list of features.

    MQ.PWS("# " + MQ.getDateTime() + " Counting total background GO terms...",
           outfile)
コード例 #4
0
go_prefix = "go/" + species
cnv_prefix = go_prefix + "-cnv-" + mode

gofile = go_prefix + "-go-terms-uniq.txt"
transcripts_file = go_prefix + "-noalu-transcripts-" + mode + ".txt"
# The input files: a GO term database from Ensembl (identical lines removed
# with bash's sort | uniq) and a file of transcript IDs that overlap CNVs
# (from 06_gene_count_bed.py).

queryoutfile = cnv_prefix + "-go-query.tab"
bgoutfile = cnv_prefix + "-go-bg.tab"
# The output files: the query and background GO tables.

for io_label, io_value in (
    ("# Go file          : ", gofile),
    ("# Transcripts file : ", transcripts_file),
    ("# Mode             : ", mode),
    ("# Query out        : ", queryoutfile),
    ("# BG out           : ", bgoutfile),
):
    MQ.PWS(io_label + io_value)
MQ.PWS("# ----------------")

MQ.PWS("# " + MQ.getDateTime() + " Getting annotation info...")
# Containers created before the GO file is read; how they are filled is
# not visible in this part of the loop.
transcript_go = {}
go_accs = {}
# Flag used to skip the first line of the GO file (presumably a header
# row -- confirm against the file format).
first = True
# gzip.open() defaults to binary mode, so each line is bytes and is decoded
# below. NOTE(review): the handle is never closed explicitly; consider a
# `with` block when this loop is next edited.
for line in gzip.open(gofile):
    if first:
        first = False
        continue

    # Decode, strip surrounding whitespace, and split into tab-separated columns.
    line = line.decode().strip().split("\t")
    # Debug aid (disabled): print(line)

    # The 5th column is compared against "MT"; matching rows are skipped
    # (presumably mitochondrial entries -- confirm).
    chrome = line[4]
    if chrome == "MT":
        continue
コード例 #5
0
    dumpfile = "../cafe-data/dump.out.mlemur-blast.txt.I30"
    gtf_file = "gtf/" + species + "-chromes.gtf"
    # NOTE(review): overlap_file used to be assigned twice in a row; the
    # first value ("bed/<species>-cnvs-filtered-noalu.csv") was overwritten
    # before any use, so that dead store is removed. If that path was meant
    # for a different variable, reintroduce it under its own name.
    overlap_file = "bed/" + species + "-cnvs-to-transcripts.bed"
    outfilename = "../cafe-data/" + species + "-cafe-genes.csv"
    # File names.

    # Labels are padded to a common width that leaves room for a space after
    # the longest one; "# Overlap file:" was previously fused to its value.
    MQ.PWS("# Log file:     " + logfilename, logfile)
    MQ.PWS("# GTF file:     " + gtf_file, logfile)
    MQ.PWS("# Overlap file: " + overlap_file, logfile)
    MQ.PWS("# Dump file:    " + dumpfile, logfile)
    MQ.PWS("# Output file:  " + outfilename, logfile)
    MQ.PWS("# ----------------", logfile)
    # I/O options and info.

    MQ.PWS("# " + MQ.getDateTime() + " Reading features from GTF...", logfile)
    # ttp maps a transcript ID to a protein ID; unseen keys default to "".
    ttp = defaultdict(str)
    # overlaps is keyed by transcript ID.
    overlaps = {}
    # NOTE(review): the handle returned by open() is never closed
    # explicitly; consider a `with` block when this loop is next edited.
    for line in open(gtf_file):
        # Skip lines that start with "#".
        if line[0] == "#":
            continue
        line = line.strip().split("\t")
        # Tab-separated columns used: 0 = chrome, 2 = feature type,
        # 3 and 4 = start and end (converted to int), 8 = feature info.
        feature_type, chrome, start, end, feature_info = line[2], line[0], int(
            line[3]), int(line[4]), line[8]
        if feature_type == "CDS":
            # Take the first ID in the feature info matching
            # 'ENS' + regstr + 'T<digits>' (transcript) and
            # 'ENS' + regstr + 'P<digits>' (protein); regstr is defined
            # outside this snippet. An IndexError is raised if there is no match.
            # NOTE(review): the patterns are not raw strings, so "\d" relies
            # on Python keeping the unknown escape; newer versions warn
            # about this -- prefer r'...' literals.
            tid = re.findall('ENS' + regstr + 'T[\d]+', feature_info)[0]
            pid = re.findall('ENS' + regstr + 'P[\d]+', feature_info)[0]

            # ttp[tid] is a string, so this is a substring test: it is True
            # the first time a transcript is seen (ttp[tid] == "") and False
            # once the same protein ID has been stored.
            if pid not in ttp[tid]:
                ttp[tid] = pid
                overlaps[tid] = {