def multi_align():
    """Build one multi-FASTA per reference region and align it with clustalo.

    Reads the module-level ``multi_align_indi_dict``: its keys are
    ``(ref_region, ref_e_region)`` tuples and each value maps a VCF file key
    to an ``(iso_e_region, iso_concensus_fasta)`` pair.  For every key the
    reference sequence is written first (``samtools faidx`` on ``REF_FASTA``),
    the isolate sequences are appended in sorted key order, and the file is
    aligned with ``clustalo``.

    Also uses the module-level ``prefix`` and ``tmp_region_fname`` and the
    helpers ``get_region_seq`` / ``replace_the_first_line`` defined elsewhere.
    """
    clustalo_cmd = "clustalo --infile {} --threads 8 --verbose --outfmt clustal --resno --outfile {} --output-order input-order --seqtype dna --force"
    for ref_info, isolates in multi_align_indi_dict.items():
        ref_region, ref_e_region = ref_info
        print("region2: " + ref_region + " " + ref_e_region)
        region_fasta = "{}.fasta".format(ref_region)
        alignment_out = "{}_{}.clustalo_num".format(prefix, ref_region)

        # Reference record: the region is passed to samtools through a
        # one-line region file, then the FASTA header is renamed to ref_<region>.
        write_region = "echo {} >{}".format(ref_e_region, tmp_region_fname)
        command(write_region).run_comm(0)
        extract_ref = "samtools faidx -r {} {} >{}".format(
            tmp_region_fname, REF_FASTA, region_fasta)
        command(extract_ref).run_comm(0)
        rename_header = 'sed -i "1s/.*/>ref_{}/" {}'.format(
            ref_e_region, region_fasta)
        command(rename_header).run_comm(0)

        # Isolate records, appended in sorted key order.
        for vcf_file_key in sorted(isolates):
            iso_e_region, iso_concensus_fasta = isolates[vcf_file_key]
            raw_record = get_region_seq(iso_e_region, iso_concensus_fasta)
            new_header = ">{}_{}".format(vcf_file_key, iso_e_region)
            record = replace_the_first_line(raw_record, new_header)
            append_record = 'echo "{}" >> {}'.format(record, region_fasta)
            command(append_record).run_comm(0)

        # One alignment per expanded region.
        command(clustalo_cmd.format(region_fasta, alignment_out)).run_comm(0)
Exemple #2
0
def qc_by_quast():
    """Run QUAST on ``scaffold_fasta`` against the reference genome.

    Sets the module-level ``qc_files`` to a fixed list of report file names.
    The reference FASTA and GFF paths are built from the module-level
    ``workdir`` and ``genome_name``; ``workdir`` is also passed to QUAST as
    its output directory.
    """
    global qc_files
    qc_files = ["report.txt", "report.tsv", "report.pdf", "report.html"]
    ref_fasta_path = "{}/../../ref/{}.fasta".format(workdir, genome_name)
    gff_path = "{}/../../ref/{}.gff".format(workdir, genome_name)
    quast_cmd = "quast {} -r {} -g {} -o {}".format(
        scaffold_fasta, ref_fasta_path, gff_path, workdir)
    command(quast_cmd).run_comm(0)
Exemple #3
0
def get_gff_from_genome_name():
    """Set the module-level ``genome_gff_fname`` for the selected ``genome``.

    For every genome except ``cryptosporidium_hominis`` the name is whatever
    ``misc.download("gff3", genome, "")`` returns.  For
    ``cryptosporidium_hominis`` the file configured under the ``ch_gff``
    property is copied into the current directory and a fixed file name is
    used.
    """
    global genome_gff_fname
    if genome != "cryptosporidium_hominis":
        genome_gff_fname = misc.download("gff3", genome, "")
        return
    copy_cmd = "cp -p {} .".format(prop.get_attrib("ch_gff"))
    command(copy_cmd).run_comm(0)
    genome_gff_fname = "GCA_002223825.1_C.hominis.v1_genomic.gff"
def run_trim_galore(fastqfiles):
    """Run Trim Galore and record the output files it is expected to write.

    Sets the module-level ``fqout1`` and ``tg_out_files`` and, when the
    module-level ``fastq2`` is not ``None``, ``fqout2``.  The expected names
    are derived from the module-level ``fastq1`` / ``fastq2`` paths and
    ``workdir``.

    Args:
        fastqfiles: list holding one (single-end) or two (paired-end) FASTQ
            paths.  Any other length runs no command.
    """
    global fqout1
    global fqout2
    global tg_out_files

    def _base_name(fname):
        # str.rstrip(".fastq") strips a *set of characters*, so
        # "reads.fastq" became "read".  Remove the literal extension instead.
        # ".fq" is handled too, because the old code happened to strip it.
        for suffix in (".fastq", ".fq"):
            if fname.endswith(suffix):
                return fname[:-len(suffix)]
        return fname

    tg_out_files = []
    fastq1_name = os.path.basename(fastq1)
    fastq1_name_base = _base_name(fastq1_name)
    report1 = os.path.join(workdir, fastq1_name + "_trimming_report.txt")
    if fastq2 is None:
        fqout1 = os.path.join(workdir, fastq1_name_base + "_trimmed.fq")
        tg_out_files = [fqout1, report1]
    else:
        fqout1 = os.path.join(workdir, fastq1_name_base + "_val_1.fq")
        fastq2_name = os.path.basename(fastq2)
        fastq2_name_base = _base_name(fastq2_name)
        fqout2 = os.path.join(workdir, fastq2_name_base + "_val_2.fq")
        report2 = os.path.join(workdir, fastq2_name + "_trimming_report.txt")
        tg_out_files = [fqout1, report1, fqout2, report2]
    # paired-end input
    if len(fastqfiles) == 2:
        command("trim_galore --paired -q 20 " + fastqfiles[0] + " " +
                fastqfiles[1]).run_comm(0)
    # single-end input
    elif len(fastqfiles) == 1:
        command("trim_galore -q 20 " + fastqfiles[0]).run_comm(0)
def vcf_analysis():
    """Build a consensus genome per VCF and BLAST the reference regions on it.

    Resets the module-level ``multi_align_indi_dict`` to an empty nested
    ``defaultdict``.  Then, for every path in the module-level ``vcf_files``:

    1. compress (``bgzip``) and index (``tabix``) the VCF,
    2. apply it to ``REF_FASTA`` with ``bcftools consensus``,
    3. build a nucleotide BLAST database from the consensus,
    4. query that database with ``ref_exp_region_fasta``,
    5. hand the result to ``exp_region_analysis``.

    All output files are named after the VCF base name up to its last dot.

    Raises:
        IndexError: if a VCF base name contains no dot.
    """
    global multi_align_indi_dict
    multi_align_indi_dict = defaultdict(lambda: defaultdict(str))
    for vcf_fpath in vcf_files:
        # Raw string: "\." in a normal literal is an invalid escape sequence.
        vcf_fname_key = re.findall(r"(.*)\.", os.path.basename(vcf_fpath))[0]
        compressed_vcf_name = vcf_fname_key + ".bgzip"
        consensus_fasta_name = vcf_fname_key + ".consensus.fasta"
        command("bgzip -c {} > {}".format(vcf_fpath,
                                          compressed_vcf_name)).run_comm(0)
        command("tabix {} -f".format(compressed_vcf_name)).run_comm(0)
        command("cat {} | bcftools consensus {} > {}".format(
            REF_FASTA, compressed_vcf_name, consensus_fasta_name)).run_comm(0)
        command(
            "makeblastdb -in {}  -parse_seqids -dbtype nucl -out {}.DBblast".
            format(consensus_fasta_name, vcf_fname_key)).run_comm(0)
        blast_out_fname = vcf_fname_key + ".ref.exp.blast_out"
        command(
            "blastn -query {} -db {}.DBblast -dust no -outfmt 7 -max_target_seqs 1 -out {}"
            .format(ref_exp_region_fasta, vcf_fname_key,
                    blast_out_fname)).run_comm(0)
        exp_region_analysis(vcf_fname_key, blast_out_fname,
                            consensus_fasta_name)
def get_info_from_gff3():
    """Index gene/CDS features of the reference GFF and write a sorted gene BED.

    Sets two module-level names:

    * ``gff3_dict[chrom][mol_type][(start, end)] = gene_name`` for every
      ``gene`` and ``CDS`` line (coordinates are kept as strings);
    * ``gene_bed_fPath``: ``<workdir>/gene.bed``, holding one line per gene
      (chrom, start, end, strand, phase, gene name), sorted by chromosome and
      then numerically by start.

    Raises:
        IndexError: if a gene line has no ``ID=...;`` attribute or a CDS line
            has no ``Parent=...-`` attribute.
    """
    global gff3_dict
    global gene_bed_fPath
    gff3_dict = defaultdict(lambda: defaultdict(lambda: defaultdict(str)))
    ref_gff = "{}/../ref/{}.gff".format(workdir, genome_name)
    gene_bed_fPath = "{}/gene.bed".format(workdir)
    tmp_fPath = "{}/tmp.bed".format(workdir)
    # Context managers close both handles even if a line fails to parse.
    with open(ref_gff, "r") as fh_gff3, open(tmp_fPath, "w") as fh_tmp:
        for line in fh_gff3:
            if line.startswith("#"):
                continue
            # NOTE(review): fields are split on any whitespace, so an
            # attribute column containing spaces is truncated - confirm the
            # input GFF never has spaces there.
            (chrom, mol_type, start, end, strand, phase,
             gene_part_ori) = getVar(line.split(), [0, 2, 3, 4, 6, 7, 8])
            if mol_type == 'gene':
                gene_name = re.findall("ID=(.*?);", gene_part_ori)[0]
                fh_tmp.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(
                    chrom, start, end, strand, phase, gene_name))
            elif mol_type == 'CDS':
                gene_name = re.findall("Parent=(.*?)-", gene_part_ori)[0]
            else:
                continue
            gff3_dict[chrom][mol_type][(start, end)] = gene_name
    # The temporary file is closed (flushed) before sort reads it.
    command("sort -k1,1 -k2,2n {} > {}".format(tmp_fPath,
                                               gene_bed_fPath)).run_comm(0)
    command("rm {}".format(tmp_fPath)).run_comm(0)
def run_QC():
    """Run FastQC on the raw and trimmed reads, then summarise with MultiQC.

    Sets the module-level names ``sample_runID``, ``fastqc_out_files``,
    ``fastqc_out_files_str`` (the FastQC outputs joined by single spaces) and
    ``multiqc_out_prefix``.

    ``sample_runID`` is ``<sample>_<runID>`` when ``mapping_file`` is given,
    with the sample looked up through ``MISC.get_samples_by_runIDs``;
    otherwise it is just ``runID``.
    """
    global fastqc_out_files
    global fastqc_out_files_str
    global multiqc_out_prefix
    global sample_runID

    # Sample label
    if mapping_file is None:
        sample_runID = runID
    else:
        sample = MISC.get_samples_by_runIDs(mapping_file)[runID]
        sample_runID = "{}_{}".format(sample, runID)

    fastqc_out_files = []
    fastqc_out_files_str = ""
    infiles = [fq_ori1, fqout1]
    if fastq2 is not None:
        infiles += [fq_ori2, fqout2]

    # FastQC: one run per input file, results collected in input order.
    fastqc_out_files.extend(fastQC(infile) for infile in infiles)
    fastqc_out_files_str = " ".join(fastqc_out_files).rstrip(" ")

    # MultiQC over all FastQC outputs.
    multiqc_out_prefix = sample_runID + ".multiQC"
    multiqc_cmd = "multiqc -f {} -o {} --filename {} -v".format(
        fastqc_out_files_str, workdir, multiqc_out_prefix)
    command(multiqc_cmd).run_comm(0)
def run_trim_galore(fastqfiles):
    """Run the configured trimming tool and record its expected output names.

    Sets the module-level ``fastq1_name``, ``fastq1_name_base`` and ``fqout1``
    and, when the module-level ``fastq2`` is not ``None``, ``fastq2_name``,
    ``fastq2_name_base`` and ``fqout2``.  The executable is looked up with
    ``prop.get_attrib(qc_sw)``.

    Args:
        fastqfiles: list holding one (single-end) or two (paired-end) FASTQ
            paths.  Any other length runs no command.
    """
    global fastq1_name
    global fastq1_name_base
    global fastq2_name
    global fastq2_name_base
    global fqout1
    global fqout2

    def _base_name(fname):
        # str.rstrip(".fastq") strips a *set of characters*, so
        # "reads.fastq" became "read".  Remove the literal extension instead.
        # ".fq" is handled too, because the old code happened to strip it.
        for suffix in (".fastq", ".fq"):
            if fname.endswith(suffix):
                return fname[:-len(suffix)]
        return fname

    fastq1_name = os.path.basename(fastq1)
    fastq1_name_base = _base_name(fastq1_name)
    if fastq2 is None:
        fqout1 = os.path.join(workdir, fastq1_name_base + "_trimmed.fq")
    else:
        fqout1 = os.path.join(workdir, fastq1_name_base + "_val_1.fq")
        fastq2_name = os.path.basename(fastq2)
        fastq2_name_base = _base_name(fastq2_name)
        fqout2 = os.path.join(workdir, fastq2_name_base + "_val_2.fq")

    # Path of the trimming executable, taken from the property file.
    qc_sw_path = prop.get_attrib(qc_sw)

    # paired-end input
    if len(fastqfiles) == 2:
        command(qc_sw_path + " --paired -q 20 " + fastqfiles[0] + " " +
                fastqfiles[1]).run_comm(0)
    # single-end input
    elif len(fastqfiles) == 1:
        command(qc_sw_path + " -q 20 " + fastqfiles[0]).run_comm(0)
def write_gene_matrix_summary():
    """Write the per-gene variant summary table and a filtered copy of it.

    Sets the module-level names ``gene_matrix_summary_file``, ``sample_num``
    and ``alt_isolates_nums`` (25 %, 50 % and 75 % of the sample count,
    truncated to integers and stored as strings).

    The unfiltered tab-separated table goes to ``<prefix>_gene_summary_ori.csv``.
    An ``awk`` pass then keeps only the lines whose 14th or 16th column is
    non-zero and writes them to ``gene_matrix_summary_file``.
    """
    global alt_isolates_nums
    global sample_num
    global gene_matrix_summary_file
    gene_matrix_summary_file = "{}_gene_summary.csv".format(prefix)
    sample_num = len(sample_names)
    alt_isolates_nums = [
        str(int(fraction * sample_num)) for fraction in (0.25, 0.5, 0.75)
    ]
    ann_types = ["dN", "dS"]
    fPath_out = "{}_gene_summary_ori.csv".format(prefix)
    # The context manager closes the table even if a gene lookup raises.
    with open(fPath_out, 'w') as fileout:
        # Column names.
        # NOTE(review): "STRAT" looks like a typo for "START"; left unchanged
        # because downstream readers may match on the existing header.
        fileout.write("{}\t{}\t{}\t{}\t{}".format(
            "GENE", "CHROMOSOME", "STRAT", "END", "LENGTH"))
        for go_col in go_col_names:
            fileout.write("\t{}".format(go_col))
        for num_range in alt_isolates_nums:
            for ann_type in ann_types:
                fileout.write(
                    "\t{}_locusVar_num(>={}_isolate(s))\t{}_locusVar_num(>={}_isolate(s))/kb"
                    .format(ann_type, num_range, ann_type, num_range))
            fileout.write("\tdN/dS(>={}_isolate(s))".format(num_range))
        fileout.write("\n")
        # One row per gene, in sorted gene order.
        for gene in sorted(gene_variant_dict.keys()):
            gene_info = genome_gene_dict[gene]
            fileout.write("{}".format(gene))
            fileout.write("\t{}\t{}\t{}\t{}".format(
                gene_info[0], gene_info[1], gene_info[2], gene_info[3]))
            for each_go in go_dict[gene]:
                fileout.write("\t{}".format(each_go))
            fileout.write("\t{}\n".format(
                get_str(gene_locus[gene], gene_info[3])))
    cmd = "cat {} | awk '{{split($0,a,\"\\t\");if(a[14]!=0 || a[16]!=0) print $0}}' >{}"
    command(cmd.format(fPath_out, gene_matrix_summary_file)).run_comm(0)
def run_vcf_merge():
    """Merge the per-sample VCFs into ``<workdir>/<prefix>_merged.vcf``.

    The variant arguments come from ``get_variant_str()``.  The merged path
    is stored in the module-level ``merged_vcf`` before GATK CombineVariants
    is run against ``REF_FASTA``.
    """
    global merged_vcf
    variants_str = get_variant_str()
    merged_vcf = "".join([workdir, "/", prefix, "_merged.vcf"])
    gatk_cmd = "gatk -T CombineVariants -R {} {} -o {} -genotypeMergeOptions UNIQUIFY"
    command(gatk_cmd.format(REF_FASTA, variants_str, merged_vcf)).run_comm(0)
Exemple #11
0
def post_process():
    """Copy assembly outputs and QC reports to their destination directories.

    ``scaffolds.fasta`` and ``spades.log`` are copied to ``outdir`` with
    ``<prefix>_`` prepended.  Every file listed in ``qc_files`` and the
    ``<prefix>.multiQC*.html`` report(s) are copied to ``qcdir``.
    """
    print("post_processing...")
    for assembly_file in ("scaffolds.fasta", "spades.log"):
        FI.copy_file_add_prefix(assembly_file, outdir, prefix + "_")
    for report_file in qc_files:
        FI.copy_file_to_destdir(report_file, qcdir)
    copy_multiqc = "cp -p {}.multiQC*.html {}".format(prefix, qcdir)
    command(copy_multiqc).run_comm(0)
def create_upset_matrix():
    """Build the interval-sharing matrix for all VCFs with bedtools multiinter.

    Sets the module-level ``upset_mat_file`` to ``<prefix>_upset.mat``.  The
    genome size file that ``-g`` needs is produced with ``infoseq`` from
    ``REF_FASTA`` and written to ``<workdir>/genome.size``.  The matrix itself
    is written to ``<workdir>/<upset_mat_file>`` and then passed to
    ``change_fileName_to_isoName``.
    """
    global upset_mat_file
    upset_mat_file = "{}_upset.mat".format(prefix)
    command(
        "infoseq -sequence {} -only -name -length -outfile {}/genome.size -nohead -auto"
        .format(REF_FASTA, workdir)).run_comm(0)
    # The option is spelled "-empty"; the previous "-emtpy" is not a
    # bedtools multiinter option.
    command(
        "bedtools multiinter -g {}/genome.size -empty -header -i {} >{}/{}".
        format(workdir, vcf_files_str, workdir, upset_mat_file)).run_comm(0)
    change_fileName_to_isoName(upset_mat_file)
def run_snpEff():
    """Annotate every VCF in ``all_vcf_fpaths`` with snpEff.

    Sets the module-level ``ann_vcf_fpaths`` to the annotated file names, in
    input order.  Each one is ``<vcf base name without .vcf>.ann.vcf``,
    relative to the current directory.  The snpEff jar path comes from the
    ``snpeff`` property and ``genome`` is passed as the snpEff database name.
    """
    global ann_vcf_fpaths
    ann_vcf_fpaths = []
    snpEff = prop.get_attrib("snpeff")
    for vcf_fpath in all_vcf_fpaths:
        # str.rstrip(".vcf") strips the characters '.', 'v', 'c', 'f' (so
        # "isolate_v.vcf" became "isolate_"); remove the extension instead.
        vcf_fpath_prefix = os.path.basename(vcf_fpath)
        if vcf_fpath_prefix.endswith(".vcf"):
            vcf_fpath_prefix = vcf_fpath_prefix[:-len(".vcf")]
        ann_vcf_fpath = vcf_fpath_prefix + ".ann.vcf"
        ann_vcf_fpaths.append(ann_vcf_fpath)
        # NOTE(review): this calls .run(0) where similar code elsewhere calls
        # .run_comm(0) - confirm that `command` really provides `run`.
        command("java -jar {} -c snpEff.config {} {} > {}".format(
            snpEff, genome, vcf_fpath, ann_vcf_fpath)).run(0)
Exemple #14
0
 def run_snpEff(self):
     """Build the snpEff database, then annotate ``fName_str_vcf``.

     The jar path is read from the ``snpeff`` property of ``self.prop``.  The
     annotated VCF is named ``<self.prefix>.ann.vcf`` and that name is stored
     in the module-level ``fName_ann_vcf``.
     """
     global fName_ann_vcf
     print("run annotation...")
     annot_sw = "snpeff"
     snpeff_jar = self.prop.get_attrib(annot_sw)
     db_builder = snpEff_db(self.properties_file, self.genome_name)
     db_builder.build_snpeff_db()
     fName_ann_vcf = self.prefix + ".ann.vcf"
     annotate_cmd = "java -jar {} -c snpEff.config {} {} > {}".format(
         snpeff_jar, self.genome_name, fName_str_vcf, fName_ann_vcf)
     command(annotate_cmd).run_comm(0)
def fastqc(bam):
    """Run FastQC on a copy of ``bam`` that carries a shortened file name.

    The short name is the BAM base name with ``_grouped_dedup``, ``_grouped``
    and ``<prefix_ori>_`` removed, in that order.  The BAM is copied to
    ``workdir`` under that name before FastQC is run on it.

    Sets the module-level ``bam_for_fastqc`` (the short name) and
    ``fastqc_file`` (the short name with ``.bam`` replaced by
    ``_fastqc.zip``).

    Args:
        bam: path of the BAM file to check.
    """
    global fastqc_file
    global bam_for_fastqc
    # sample_runID is already part of the BAM file name.
    short_name = os.path.basename(bam)
    short_name = short_name.replace("_grouped_dedup", "")
    short_name = short_name.replace("_grouped", "")
    short_name = short_name.replace(prefix_ori + "_", "")
    bam_for_fastqc = short_name
    FI.copy_file(bam, "{}/{}".format(workdir, bam_for_fastqc))
    fastqc_cmd = "fastqc -o {} --noextract -f bam_mapped {}".format(
        workdir, bam_for_fastqc)
    command(fastqc_cmd).run_comm(0)
    fastqc_file = bam_for_fastqc.replace(".bam", "_fastqc.zip")
Exemple #16
0
def generate_cluster3():
    """Merge all VCFs with GATK CombineVariants and cluster them with plink.

    The merged VCF is written to a file named exactly ``prefix`` (no
    extension), which is then the ``--vcf`` input for plink.  The
    module-level ``cluster_fname`` is set to ``<prefix>.cluster3``.  Tool
    paths come from the ``gatk`` and ``plink`` properties.
    """
    global cluster_fname
    gatk_jar = prop.get_attrib("gatk")
    combine_cmd = "java -jar {} -T CombineVariants -R {} {} -o {} -genotypeMergeOptions UNIQUIFY".format(
        gatk_jar, genome_fasta, get_variant_str(), prefix)
    command(combine_cmd).run_comm(0)
    plink_bin = prop.get_attrib("plink")
    cluster_cmd = "{} --vcf {} -cluster --allow-extra-chr -out {}".format(
        plink_bin, prefix, prefix)
    command(cluster_cmd).run_comm(0)
    cluster_fname = prefix + ".cluster3"
def check_seq_hunN(region):
    """Report whether the reference sequence of ``region`` matches ``sixtyNstr``.

    The region is written to ``tmp_region_fname`` and extracted from
    ``REF_FASTA`` with ``samtools faidx``.  All newlines are removed from the
    output (so the FASTA header is part of the searched text) before the
    module-level pattern ``sixtyNstr`` is searched for.

    Args:
        region: region string, passed to samtools through the region file.

    Returns:
        1 if the pattern is found, otherwise 0.
    """
    command("echo {} >{}".format(region, tmp_region_fname)).run_comm(0)
    faidx_cmd = "samtools faidx -r {} {}".format(tmp_region_fname, REF_FASTA)
    faidx_out = command(faidx_cmd).run_comm(1).decode("utf-8").rstrip()
    region_fasta_str = faidx_out.replace("\n", "")
    print("region_fasta_str=" + region + "\n" + region_fasta_str)
    if not re.search(sixtyNstr, region_fasta_str):
        return 0
    print("here 60N:" + region)
    return 1
def deduplication():
    """Remove duplicate reads from the trimmed FASTQ file(s) with clumpify.

    Output names replace ``.fq`` with ``.dedup.fq`` in ``fqout1`` and, when
    ``fastq2`` is not ``None``, in ``fqout2``.  The module-level
    ``deduped_fqs`` lists the output paths in that order.
    """
    global deduped_fqs
    clumpify_cmd = "clumpify in={} out={} dedupe=t"
    dedup_out1 = fqout1.replace(".fq", ".dedup.fq")
    command(clumpify_cmd.format(fqout1, dedup_out1)).run_comm(0)
    deduped_fqs = [dedup_out1]
    if fastq2 is None:
        return
    dedup_out2 = fqout2.replace(".fq", ".dedup.fq")
    command(clumpify_cmd.format(fqout2, dedup_out2)).run_comm(0)
    deduped_fqs.append(dedup_out2)
def deduplication():
    """Remove duplicate reads from the trimmed FASTQ file(s).

    The executable is looked up with ``prop.get_attrib(rm_dup_sw)``.  Output
    names replace ``.fq`` with ``.dedup.fq`` and are stored in the
    module-level ``fq_dedup_out1`` and, when ``fastq2`` is not ``None``,
    ``fq_dedup_out2``.
    """
    global fq_dedup_out1
    global fq_dedup_out2
    rm_dup_sw_path = prop.get_attrib(rm_dup_sw)
    dedup_cmd = "{} in={} out={} dedupe=t"

    fq_dedup_out1 = fqout1.replace(".fq", ".dedup.fq")
    command(dedup_cmd.format(rm_dup_sw_path, fqout1,
                             fq_dedup_out1)).run_comm(0)
    if fastq2 is None:
        return
    fq_dedup_out2 = fqout2.replace(".fq", ".dedup.fq")
    command(dedup_cmd.format(rm_dup_sw_path, fqout2,
                             fq_dedup_out2)).run_comm(0)
def multiqc():
    """Aggregate the mapping QC outputs with MultiQC.

    MultiQC receives ``fastqc_file``, then ``bowtie2_log_for_multiqc`` when
    ``mapping_tool`` is ``"bowtie2"``, then ``qualimap_dir``.

    Sets the module-level ``multiQC_outFN`` (``<sample_runID>.multiQC``) and
    ``fastqc_html`` (``<prefix_ori>_<sample_runID>_fastqc.html``).
    """
    global multiQC_outFN
    global fastqc_html
    inputs = [fastqc_file]
    if mapping_tool == "bowtie2":
        inputs.append(bowtie2_log_for_multiqc)
    inputs.append(qualimap_dir)
    multiQC_input = " ".join(inputs)
    multiQC_outFN = "{}.multiQC".format(sample_runID)
    multiqc_cmd = "multiqc -f {} -o {} --filename {} -v".format(
        multiQC_input, workdir, multiQC_outFN)
    command(multiqc_cmd).run_comm(0)
    fastqc_html = "{}_{}_fastqc.html".format(prefix_ori, sample_runID)
def run_snpEff():
    """Annotate every VCF in ``all_vcf_fpaths`` with snpEff.

    Sets the module-level ``ann_vcf_fpaths`` to the annotated file names, in
    input order.  Each one is ``<vcf base name without .vcf>.ann.vcf``,
    relative to the current directory.  The short ``genome`` code (``"ch"``
    or ``"cp"``) is mapped to the snpEff database name.

    Raises:
        KeyError: if ``genome`` is neither ``"ch"`` nor ``"cp"``.
    """
    global ann_vcf_fpaths
    annot_sw = "snpeff"
    ann_vcf_fpaths = []
    snpEff = prop.get_attrib(annot_sw)
    genome_db = {"ch": "c_hominis", "cp": "c_parvum"}
    for vcf_fpath in all_vcf_fpaths:
        # str.rstrip(".vcf") strips the characters '.', 'v', 'c', 'f' (so
        # "isolate_v.vcf" became "isolate_"); remove the extension instead.
        vcf_fpath_prefix = os.path.basename(vcf_fpath)
        if vcf_fpath_prefix.endswith(".vcf"):
            vcf_fpath_prefix = vcf_fpath_prefix[:-len(".vcf")]
        ann_vcf_fpath = vcf_fpath_prefix + ".ann.vcf"
        ann_vcf_fpaths.append(ann_vcf_fpath)
        command("java -jar {} -c snpEff.config {} {} > {}".format(
            snpEff, genome_db[genome], vcf_fpath, ann_vcf_fpath)).run_comm(0)
def add_group(fastqfiles):
    """Add read-group information to the sorted BAM with Picard.

    The read-group id is the text between the leading ``@`` and the first
    ``.`` on the first line of ``fq1``.  It is used for both ``RGID`` and
    ``RGSM``; ``RGPU`` is the literal ``NA``; ``RGLB`` and ``RGPL`` come from
    ``dna_library`` and ``platform``.  The output path
    ``<workdir>/<sample_runID>_grouped.bam`` is stored in the module-level
    ``grouped_bam``.

    Args:
        fastqfiles: not used by this function.
            NOTE(review): the FASTQ path is read from the name ``fq1``
            instead - confirm that is intended.

    Raises:
        IndexError: if the first line of ``fq1`` does not match ``@<id>.``.
    """
    global grouped_bam
    grouped_bam = "{}/{}_grouped.bam".format(workdir, sample_runID)
    # Raw string: "\@" and "\." are invalid escapes in a normal literal.
    fq_id_pat = r"^\@(.*?)\."
    first_line_fq1 = command("head -n 1 " +
                             fq1).run_comm(1).decode("utf-8").rstrip()
    fq_id = re.findall(fq_id_pat, first_line_fq1)[0]
    cmd_str = "picard AddOrReplaceReadGroups I={} O={} RGID={} RGPU={} RGSM={} RGLB={} RGPL={} VALIDATION_STRINGENCY=LENIENT"
    command(
        cmd_str.format(bam_sorted, grouped_bam, fq_id, "NA", fq_id,
                       dna_library, platform)).run_comm(0)
def post_process():
    """Copy trimming, FastQC, MultiQC and deduplication outputs into place.

    * every file in ``tg_out_files`` goes to ``outdir`` with ``<prefix>_``
      prepended;
    * every file in ``fastqc_out_files``, plus the same name with ``.zip``
      replaced by ``.html``, goes to ``qcdir``;
    * ``<multiqc_out_prefix>.html`` goes to ``qcdir``;
    * when ``if_dedup`` is truthy, every file in ``deduped_fqs`` goes to
      ``outdir`` with ``<prefix>_`` prepended.
    """
    print("post_processing...")
    name_prefix = prefix + "_"
    for trimmed_file in tg_out_files:
        FI.copy_file_add_prefix(trimmed_file, outdir, name_prefix)
    for zip_report in fastqc_out_files:
        html_report = zip_report.replace(".zip", ".html")
        FI.copy_file_to_destdir(zip_report, qcdir)
        FI.copy_file_to_destdir(html_report, qcdir)
    copy_multiqc = "cp -p {}.html {}".format(multiqc_out_prefix, qcdir)
    command(copy_multiqc).run_comm(0)
    if not if_dedup:
        return
    for deduped_fq in deduped_fqs:
        FI.copy_file_add_prefix(deduped_fq, outdir, prefix + "_")
def report():
    """Write the codeml omega lines and matching accessions to ``report``.

    For every file in ``codeml_outputs`` whose ``grep omega`` output contains
    a digit, the output is written, followed by one ``<acc>_<genome>`` line
    per accession and a blank line.  Accessions are the second field of each
    line starting with ``#``; the genome is looked up through
    ``acc_fasta_map`` and ``fasta_genome_map``.
    """
    # Parenthesised single-argument print works on Python 2 and Python 3.
    print("report...")
    # The context manager closes the report even if a lookup below raises.
    with open("report", 'w') as fh_report:
        for codeml_output in codeml_outputs:
            # NOTE(review): the command output is used without decoding;
            # other call sites decode run_comm(1) output as UTF-8. Confirm
            # which type it returns before running this under Python 3.
            omega = command("grep omega {}".format(codeml_output)).run_comm(1)
            if not re.search(r"\d+", omega):
                continue
            fh_report.write(omega)
            accs_str = command("grep '^#' {} | awk '{{print $2}}'".format(
                codeml_output)).run_comm(1)
            for acc in accs_str.rstrip().split():
                fh_report.write("{}_{}\n".format(
                    acc, fasta_genome_map[acc_fasta_map[acc]]))
            fh_report.write("\n")
def get_intersect():
    """Write pairwise intersection and Jaccard matrices for all VCF files.

    Both matrices are space-separated, with a header row starting with
    ``name`` and one row and one column per entry of ``vcf_files``, labelled
    with ``MISC.get_runID``.  Each cell takes fields 1 and 3 of the
    ``bedtools jaccard`` output.

    Sets the module-level ``intersect_fPath``, ``jaccard_fPath`` and the two
    file handles ``fhout_intersect`` / ``fhout_jaccard``.  The handles are
    module-level because ``out_files_write``, defined elsewhere, is used for
    text common to both files (presumably it writes to both handles - verify
    against its definition).  Both finished files are passed to
    ``change_fileName_to_isoName``.
    """
    global intersect_fPath
    global jaccard_fPath
    global fhout_intersect
    global fhout_jaccard
    run_ids = {}
    intersect_fPath = workdir + "/intersect.matrix"
    jaccard_fPath = workdir + "/jaccard.matrix"
    fhout_intersect = open(intersect_fPath, 'w')
    fhout_jaccard = open(jaccard_fPath, 'w')

    # Header row.
    out_files_write("name")
    for vcf in vcf_files:
        run_ids[vcf] = MISC.get_runID(vcf)
        out_files_write(" " + run_ids[vcf])
    out_files_write("\n")

    # One row per VCF; every pair is compared, including a file with itself.
    jaccard_cmd = "bedtools jaccard -a {} -b {} |cut -f1,3|grep -v jaccard"
    for row_vcf in vcf_files:
        out_files_write(run_ids[row_vcf])
        for col_vcf in vcf_files:
            raw_out = command(jaccard_cmd.format(row_vcf, col_vcf)).run_comm(1)
            fields = raw_out.decode("utf-8").rstrip().split()
            (intersect, jaccard) = getVar(fields, [0, 1])
            fhout_intersect.write(" " + intersect)
            fhout_jaccard.write(" " + jaccard)
        out_files_write("\n")

    fhout_intersect.close()
    fhout_jaccard.close()
    for matrix_path in (intersect_fPath, jaccard_fPath):
        change_fileName_to_isoName(matrix_path)
def parse_mat():
    """Strip directory parts from the sample columns of the matrix header.

    The first five header fields of ``upset_mat_file`` are kept unchanged.
    For every later field, everything up to and including the last ``/`` is
    removed.  The header is re-joined with tabs and written back with
    ``sed`` through a ``.tmp`` copy, and the file is then passed to
    ``change_fileName_to_isoName``.
    """
    head_cmd = "head -n 1 {}".format(upset_mat_file)
    header_fields = command(head_cmd).run_comm(1).decode(
        "utf-8").rstrip().split()
    cleaned_fields = list(header_fields[:5])
    cleaned_fields.extend(
        re.sub("^.*/", "", field) for field in header_fields[5:])
    new_headline = "\t".join(cleaned_fields).rstrip("\t")
    tmp_copy_cmd = "mv {} {}.tmp".format(upset_mat_file, upset_mat_file)
    command(tmp_copy_cmd).run_comm(0)
    # Replace line 1 of the copy with the cleaned header.
    rewrite_cmd = "sed '1s/.*/{}/' {}.tmp >{}".format(
        new_headline, upset_mat_file, upset_mat_file)
    command(rewrite_cmd).run_comm(0)
    change_fileName_to_isoName(upset_mat_file)
def ref_region_analsis():
    """Detect tandem repeats in the reference and collect their regions.

    Steps:

    1. run ``trf`` on ``REF_FASTA`` and convert its ``.dat`` output with
       ``jsat parseTRF``;
    2. for every repeat listed in the converted file, compute the expanded
       region (``get_exp_ref_region``), look up the repeat sequence in the
       ``.dat`` file and the annotation (``get_region_ann``);
    3. write the expanded regions to ``<prefix>.ref.exp.regions`` and a BED
       line per repeat to ``<workdir>/tmp.bed``;
    4. extract the expanded regions into ``<prefix>.ref.exp.fasta`` and call
       ``get_closest_region_and_gene`` on the BED file.

    Sets the module-level names:

    * ``sixtyNstr``: result of ``get_60Nstr()``;
    * ``ref_exp_region_dict[exp_region] = (exp_region_len, ref_region,
      repeat_seq, unit_len, ref_len, unit_num)``;
    * ``ann_dict[ref_region]``: result of ``get_region_ann``;
    * ``ref_exp_region_fasta``: name of the extracted FASTA.

    Requires the module-level ``chrom_len_dict``, ``genome_name``,
    ``prefix`` and ``workdir``.
    """
    global ref_exp_region_dict
    global ref_exp_region_fasta
    global ann_dict
    global sixtyNstr
    sixtyNstr = get_60Nstr()
    ref_exp_region_dict = {}
    ann_dict = {}

    # Tandem Repeats Finder; its output file name is derived from the input
    # name and the numeric parameters.
    command("trf {} 2 7 7 80 10 50 500 -f -d -m".format(REF_FASTA)).run_comm(0)

    ## define file names
    trf_out_name = os.path.basename(REF_FASTA) + ".2.7.7.80.10.50.500.dat"
    fName_str = "{}.trf.str".format(genome_name)
    exp_ref_region_fname = "{}.ref.exp.regions".format(prefix)
    ref_exp_region_fasta = "{}.ref.exp.fasta".format(prefix)
    tmp_bed_fPath = "{}/tmp.bed".format(workdir)

    ## convert the TRF output and read back its first five columns
    command("jsat parseTRF --input {} --output {} --format str".format(
        trf_out_name, fName_str)).run_comm(0)
    cmd_str = "grep -v '^#' {} | grep -v '^$' | awk '{{print $1\" \"$2\" \"$3\" \"$4\" \"$5}}'"
    ref_regions_str = command(cmd_str.format(fName_str)).run_comm(1).decode(
        "utf-8").rstrip().split("\n")

    # NOTE(review): this filter matches on start, end and unit length only,
    # not on chromosome - confirm that cannot yield more than one line.
    cmd = "grep '^[0-9]' {} | awk '{{if ($1=={} && $2=={} && $3=={}) print $14}}'"
    # Context managers close both outputs even if a lookup or command in the
    # loop raises.
    with open(exp_ref_region_fname, "w") as fhout_exp_ref_region, \
            open(tmp_bed_fPath, "w") as fh_bed_tmp:
        for ref_region_str in ref_regions_str:
            (ref_chrom, ref_start, ref_end, ref_str_unit_len,
             ref_unit_num_ori) = getVar(ref_region_str.split(),
                                        [0, 1, 2, 3, 4])
            # The unit count is truncated to a whole number.
            ref_unit_num = str(int(float(ref_unit_num_ori)))
            (exp_ref_region,
             exp_ref_region_len) = get_exp_ref_region(
                 ref_chrom, ref_start, ref_end, chrom_len_dict[ref_chrom])
            ref_region = "{}:{}-{}".format(ref_chrom, ref_start, ref_end)
            ref_len = str(abs(int(ref_start) - int(ref_end)) + 1)
            ref_str_seq = command(
                cmd.format(trf_out_name, ref_start, ref_end,
                           ref_str_unit_len)).run_comm(1).decode(
                               "utf-8").rstrip()
            ref_exp_region_dict[exp_ref_region] = (exp_ref_region_len,
                                                   ref_region, ref_str_seq,
                                                   ref_str_unit_len, ref_len,
                                                   ref_unit_num)
            fhout_exp_ref_region.write(exp_ref_region + "\n")
            ann_dict[ref_region] = get_region_ann(ref_region)
            print("ann_dict1={} {}".format(ann_dict[ref_region][0],
                                           ann_dict[ref_region][1]))
            fh_bed_tmp.write("{}\t{}\t{}\t{}:{}:{}\t.\t.\n".format(
                ref_chrom, ref_start, ref_end, ref_str_seq, ref_str_unit_len,
                ref_unit_num))
    # Both files are closed (flushed) before the tools below read them.
    command("samtools faidx -r {} {} -o {}".format(
        exp_ref_region_fname, REF_FASTA, ref_exp_region_fasta)).run_comm(0)
    get_closest_region_and_gene(tmp_bed_fPath)
def download(genome):
    """Download and unpack the Ensembl Genomes protist cDNA FASTA of ``genome``.

    The genome directory (lower-cased name) is searched for directly under
    the FTP root and then one level below each of its sub-directories.  The
    ``cdna`` file found there is fetched with ``curl`` and unpacked with
    ``gunzip``.  The result is registered in the module-level
    ``fasta_genome_map`` as ``{fasta name: genome}``.

    Args:
        genome: genome name; matched case-insensitively.

    Returns:
        Name of the unpacked cDNA FASTA file in the current directory.

    Exits the process with status 1 if no directory for ``genome`` is found.
    """
    ftp_dir = ""
    cDNA_fasta_fname = ""
    ftp_root_dir = "ftp://ftp.ensemblgenomes.org/pub/current/protists/fasta/"
    # NOTE(review): command output is used without decoding here; other call
    # sites decode run_comm(1) output as UTF-8. Confirm which type it
    # returns before running this under Python 3.
    sub_dirs1 = command("curl -s {} | awk '{{print $9}}'".format(
        ftp_root_dir)).run_comm(1).split()
    if genome.lower() in sub_dirs1:
        ftp_dir = "{}/{}/cdna/".format(ftp_root_dir, genome.lower())
    else:
        for sub_dir1 in sub_dirs1:
            sub_dirs2 = command("curl -s {}/ | awk '{{print $9}}'".format(
                ftp_root_dir + sub_dir1)).run_comm(1).split()
            if genome.lower() in sub_dirs2:
                ftp_dir = "{}/{}/{}/cdna/".format(ftp_root_dir, sub_dir1,
                                                  genome.lower())
    if ftp_dir == "":
        # Parenthesised single-argument print works on Python 2 and 3.
        print("can not find cDNA fasta file for {}".format(genome))
        sys.exit(1)
    else:
        cDNA_fasta_gz = command(
            "curl -s {} | awk '{{print $9}}' | grep cdna".format(
                ftp_dir)).run_comm(1)
        cDNA_fasta_gz = cDNA_fasta_gz.rstrip()
        command("curl -o {} {}".format(cDNA_fasta_gz,
                                       ftp_dir + cDNA_fasta_gz)).run_comm(0)
        # str.rstrip(".gz") strips the characters '.', 'g', 'z' (so
        # "x.seg.gz" became "x.se"); remove the extension instead.
        if cDNA_fasta_gz.endswith(".gz"):
            cDNA_fasta_fname = cDNA_fasta_gz[:-len(".gz")]
        else:
            cDNA_fasta_fname = cDNA_fasta_gz
        command("gunzip -c {} > {}".format(
            cDNA_fasta_gz, cDNA_fasta_fname)).run_comm_no_exit(1)
        fasta_genome_map[cDNA_fasta_fname] = genome
    return cDNA_fasta_fname
def get_chrom_len():
    """Read chromosome lengths from the FASTA headers of ``REF_FASTA``.

    Sets the module-level ``chrom_len_dict`` to ``{chromosome: length}``.
    Lengths are kept as the digit strings found in the headers.  Each header
    must look like ``>NAME ... length=DIGITS `` (a space after the digits).

    Raises:
        IndexError: if a header line does not match that layout.
    """
    global chrom_len_dict
    chrom_len_dict = {}
    chro_ori_lines = command("grep '>' {}".format(REF_FASTA)).run_comm(
        1).decode("utf-8").rstrip().split("\n")
    for chro_ori_line in chro_ori_lines:
        # Raw string: "\d" in a normal literal is an invalid escape sequence.
        (chrom, chr_len) = re.findall(r">(.*?) .*?length=(\d+) ",
                                      chro_ori_line)[0]
        chrom_len_dict[chrom] = chr_len
def run_snpEff():
    """Build the snpEff database and annotate every VCF in ``all_vcf_fpaths``.

    Sets the module-level names:

    * ``ann_vcf_fpaths``: annotated VCF names,
      ``<prefix>_<vcf base name without .vcf>_ann.vcf``;
    * ``ann_stat_fpaths``: snpEff CSV statistics file names, either
      ``<mirror[runID]>_<runID>.ann_stats`` when ``mapping_file`` is given or
      ``<runID>.ann_stats`` otherwise;
    * ``ann_stat_fpath``: the statistics name of the last VCF processed.
    """
    global ann_stat_fpath
    global ann_stat_fpaths
    global ann_vcf_fpaths
    ann_vcf_fpaths = []
    ann_stat_fpaths = []
    snpeff_db = snpEff_db(properties_file, genome_name)
    snpeff_db.build_snpeff_db()
    for vcf_fpath in all_vcf_fpaths:
        runID = MISC.get_runID(vcf_fpath)
        # str.rstrip(".vcf") strips the characters '.', 'v', 'c', 'f' (so
        # "isolate_v.vcf" became "isolate_"); remove the extension instead.
        vcf_fname = os.path.basename(vcf_fpath)
        if vcf_fname.endswith(".vcf"):
            vcf_fname = vcf_fname[:-len(".vcf")]
        vcf_prefix = "{}_{}".format(prefix, vcf_fname)
        ann_vcf_fpath = vcf_prefix + "_ann.vcf"
        ann_vcf_fpaths.append(ann_vcf_fpath)
        if mapping_file is not None:
            ann_stat_fpath = "{}_{}.ann_stats".format(mirror[runID], runID)
        else:
            ann_stat_fpath = runID + ".ann_stats"
        ann_stat_fpaths.append(ann_stat_fpath)
        # NOTE(review): this calls .run(0) where similar code elsewhere calls
        # .run_comm(0) - confirm that `command` really provides `run`.
        command("snpEff -c snpEff.config {} {} -csvStats {} > {}".format(
            genome_name, vcf_fpath, ann_stat_fpath, ann_vcf_fpath)).run(0)