Beispiel #1
0
def process2(R1, R2, output, adapter, threads, libpath, mapper, minlen, trim5, counts, rRNA):
    """Trim, align and quantify one paired-end sample.

    The sample name is the part of R1's file name before the first
    underscore; all results are written to ``<output>/<sample>/`` using
    ``<sample>`` as file prefix.

    Steps, each run as an external shell command:
      1. trimmomatic PE (adapter clipping, sliding window, MINLEN, HEADCROP)
      2. alignment with hisat2 (piped into ``samtools sort``) or STAR
      3. stringtie abundance estimation against ``<libpath>/annotation.gtf``
      4. featureCounts, only when ``counts`` is true
      5. ``rapvis_rRNA.rRNA``, only when ``rRNA`` is true

    Args:
        R1, R2: paths of the forward / reverse FASTQ files.
        output: base output directory.
        adapter: adapter file name, looked up under
            ``<script dir>/../library/adapter/``.
        threads: thread count handed to every tool.
        libpath: directory holding the genome index and ``annotation.gtf``.
        mapper: ``'hisat2'`` or ``'STAR'``.
        minlen: value for trimmomatic ``MINLEN``.
        trim5: value for trimmomatic ``HEADCROP``.
        counts: run featureCounts when true.
        rRNA: run the rRNA step when true.

    Raises:
        ValueError: if ``mapper`` is not one of the supported aligners.

    Note:
        The exit statuses of the external tools are not checked.
    """
    # Reject an unknown mapper before any work is done; otherwise the
    # trimming would run and the function would only fail later because
    # the BAM path was never set.
    if mapper not in ("hisat2", "STAR"):
        raise ValueError(
            "unsupported mapper %r (expected 'hisat2' or 'STAR')" % (mapper,))

    file_name = R1.split("/")[-1].split("_")[0]
    outdir = os.path.join(output, file_name)

    # exist_ok avoids the race between several jobs creating the same
    # directory, while real failures (e.g. permissions) still surface here.
    os.makedirs(outdir, exist_ok=True)

    prefix = os.path.join(outdir, file_name)

    out_R1_p = prefix + "_R1.fq.gz"
    out_R1_u = prefix + "_R1_unpaired.gz"
    out_R2_p = prefix + "_R2.fq.gz"
    out_R2_u = prefix + "_R2_unpaired.gz"

    out_log = prefix + "_trimmomatic.log"

    print("\n%s  Processing: %s, %s" % (current_time(), R1,R2))

    # Directory of the running script; the adapter library lives next to it.
    realpath = sys.path[0]

    ### trimmomatic
    subprocess.call("trimmomatic PE -threads %d -phred33 %s %s %s %s %s %s ILLUMINACLIP:%s/../library/adapter/%s:1:30:10:5 SLIDINGWINDOW:4:20 MINLEN:%d HEADCROP:%d 2> %s" % (threads, R1, R2, out_R1_p, out_R1_u, out_R2_p, out_R2_u, realpath, adapter, minlen, trim5, out_log), shell=True)

    if mapper == 'hisat2':
        ### Mapping by hisat2 (paired reads plus both unpaired files)
        SummaryFile = prefix + "_hisat_summary.txt"
        MapOut = prefix + "_hisat_sort.bam"
        subprocess.call("hisat2 -p %d -x %s/genome_tran -1 %s -2 %s -U %s,%s -t --dta --summary-file %s --new-summary|samtools sort -@ %d -m 10G -o %s" % (threads, libpath, out_R1_p, out_R2_p, out_R1_u, out_R2_u, SummaryFile, threads, MapOut), shell=True)
    else:
        ### Mapping by STAR (paired reads only)
        STARprefix = prefix + "_STAR_"
        subprocess.call("STAR --runThreadN %d --outSAMtype BAM SortedByCoordinate --genomeDir %s --readFilesIn %s %s --readFilesCommand zcat --outFileNamePrefix %s --quantMode GeneCounts --outFilterScoreMinOverLread 0.1 --outFilterMatchNminOverLread 0.1 --outFilterMatchNmin 0  --outFilterMismatchNmax 2" % (threads, libpath, out_R1_p, out_R2_p, STARprefix), shell=True)
        MapOut = prefix + "_STAR_Aligned.sortedByCoord.out.bam"  ## sorted bam file

    ### Assemble by stringtie
    print("%s Asemble ..." % current_time())
    stringtieGTF = prefix + '_stringtie.gtf'
    stringtieGene = prefix + '_gene_abund.tab'
    subprocess.call("stringtie %s -e -G %s/annotation.gtf -p %d -o %s -A %s" % (MapOut, libpath, threads, stringtieGTF, stringtieGene), shell=True)

    ### Gene counts
    if counts:
        countOut = prefix + '_gene_counts.txt'
        subprocess.call("featureCounts -a %s/annotation.gtf -o %s %s -t exon -g gene_name -T %d -Q 30 -p" % (libpath, countOut, MapOut, threads), shell=True)

    ### rRNA
    if rRNA:
        rapvis_rRNA.rRNA(R1, R2, output, threads)
Beispiel #2
0
def merge_profiles(name, output):
    """Wait for the running tasks to finish, then merge per-sample TPM tables.

    ``GetRunningTasks(name)`` is polled every 10 seconds until it returns 0.
    Afterwards every ``<output>/*/*gene_abund.tab`` file is read (first line
    skipped); the second column is used as the gene key and the ninth column
    as the TPM value. The name of the directory containing each file is used
    as the sample (column) name.

    Genes missing from a sample are filled with 0 and rows are ordered by
    their total across samples, highest first.

    Args:
        name: task name passed to ``GetRunningTasks``.
        output: base directory that holds one sub-directory per sample.

    Returns:
        Path of the written table, ``<output>/merge_gene_TPM.txt``.

    Raises:
        SystemExit: with status 1 when no ``*gene_abund.tab`` file is found.
    """
    # Block until no task is left for ``name``.
    while True:
        n = GetRunningTasks(name)
        if n == 0:
            break
        print("%s, Waitiing for task finished, remaining %d tasks" %
              (current_time(), n))
        time.sleep(10)

    print("%s, Merging profiles ... " % current_time())
    files = sorted(glob.glob("%s/*/*gene_abund.tab" % (output)))

    if not files:
        print(
            "\n### Merge profiles failed, it is not exsit in %s/*/ \n"
            % (output))
        # SystemExit is what exit() raises, without relying on the
        # interactive-only helper installed by the ``site`` module.
        raise SystemExit(1)

    # gene -> {sample -> TPM}
    dict_merge = {}
    for f in files:
        k_RNA = f.split("/")[-2]  # sample name = parent directory of the file
        with open(f) as handle:
            for line in islice(handle, 1, None):
                fields = line.strip().split("\t")
                k_map = fields[1]
                count = float(fields[8])  # TPM
                dict_merge.setdefault(k_map, {})[k_RNA] = count

    df = DataFrame(dict_merge).T
    df = df.fillna(value=0)  ### fill NA to 0
    df_sum = DataFrame(df.sum(axis=1), columns=['sum'])
    df = df.join(df_sum)
    df = df.sort_values(by="sum", ascending=False)  ### sort by sum
    df.drop(['sum'], axis=1, inplace=True)  # helper column, not exported
    merge_out = os.path.join(output, "merge_gene_TPM.txt")
    df.to_csv(merge_out,
              sep="\t",
              header=True,
              index=True,
              index_label="gene",
              float_format="%.2f")
    return merge_out
Beispiel #3
0
def merge_gene_counts(output):
    """Merge the per-sample gene count tables into one table.

    Every ``<output>/*/*gene_counts.txt`` file is read (first two lines
    skipped); the first column is used as the gene key and the seventh column
    as the integer count. The name of the directory containing each file is
    used as the sample (column) name.

    Genes missing from a sample are filled with 0, genes whose total is 0 are
    dropped, and rows are ordered by total, highest first. The result is
    written to ``<output>/merge_gene_counts.txt``.

    This step is best-effort: problems are printed, never raised.

    Args:
        output: base directory that holds one sub-directory per sample.

    Returns:
        None.
    """
    print("%s, Merging Gene Counts ... " % current_time())
    files = sorted(glob.glob("%s/*/*gene_counts.txt" % (output)))

    # Report missing inputs explicitly instead of relying on a later
    # exception to trigger the message.
    if not files:
        print("\n### Merge Gene Counts failed, it is not exsit in %s/*/ \n" %
              (output))
        return

    try:
        # gene -> {sample -> count}
        dict_merge = {}
        for f in files:
            k_RNA = f.split("/")[-2]  # sample name = parent directory
            with open(f) as handle:
                for line in islice(handle, 2, None):
                    fields = line.strip().split("\t")
                    k_map = fields[0]
                    count = int(fields[6])  # count
                    dict_merge.setdefault(k_map, {})[k_RNA] = count

        df = DataFrame(dict_merge).T
        df = df.fillna(value=0)  ### fill NA to 0
        df_sum = DataFrame(df.sum(axis=1), columns=['sum'])
        df = df.join(df_sum)
        df = df[df['sum'] > 0]
        df = df.sort_values(by="sum", ascending=False)  ### sort by sum
        df.drop(['sum'], axis=1, inplace=True)  # helper column, not exported
        merge_out2 = os.path.join(output, "merge_gene_counts.txt")
        df.to_csv(merge_out2,
                  sep="\t",
                  header=True,
                  index=True,
                  index_label="gene",
                  float_format="%.0f")
    except Exception as e:
        # Keep the pipeline running, but show what actually went wrong
        # rather than claiming the input files are missing.
        print("\n### Merge Gene Counts failed in %s/*/ : %s \n" % (output, e))
Beispiel #4
0
def _simplify_gene_type(raw_type):
    """Collapse an annotated gene type into one of the five plotted classes."""
    if raw_type in ("protein_coding", "pseudogene", "lincRNA", "antisense"):
        return raw_type
    if "pseudogene" in raw_type:
        return "pseudogene"
    return "others"


def gene_dis(fi, output, libpath):
    """Plot how many genes are expressed per sample, and their TPM density.

    Reads the merged TPM table ``fi`` (tab separated, with a ``gene`` column
    followed by one column per sample), keeps the entries with value > 0 and
    writes three PDF figures named ``<output>/merge_gene_TPM_*.pdf``:

      * ``_species_type.pdf``: genes per sample, stacked by gene type
      * ``_species_EI.pdf``: genes per sample, stacked by expression interval
      * ``_density.pdf``: per-sample density of the values on a log scale

    Gene types come from ``<libpath>/gene_type.txt`` (tab separated). Column
    index 1 is used as the key when the first gene identifier of the table
    starts with ``ENS``, otherwise column index 2; column index 3 is the type.
    Types other than protein_coding, lincRNA and antisense are collapsed to
    ``pseudogene`` (when the name contains that word) or ``others``.

    Args:
        fi: path of the merged TPM table.
        output: directory the figures are written to.
        libpath: directory containing ``gene_type.txt``.

    Raises:
        IndexError: if the table has no entry with a value above 0.
    """
    print("%s, Caculating gene expression pattern ... " % current_time())

    data = pd.read_table(fi, header=0)
    prefix = os.path.join(output, 'merge_gene_TPM')

    # Long format: one row per (gene, sample) with a positive value.
    data_melt = data.melt('gene', var_name='sample')
    data_melt = data_melt.query('value>0')
    data_melt.index = data_melt['gene']

    ### Gene species by gene type
    # Pick the key column of gene_type.txt from the style of the identifiers.
    key_column = 1 if str(data_melt.index[0]).startswith("ENS") else 2
    gene_type = {}
    with open("%s/gene_type.txt" % libpath) as f:
        for line in f:
            fields = line.strip().split("\t")
            # Collapsing here replaces an element-wise loop that addressed a
            # string-indexed Series by integer position.
            gene_type[fields[key_column]] = _simplify_gene_type(fields[3])

    gene_type = pd.Series(gene_type, name='gene_type', dtype="string")

    # Categories
    data_melt2 = pd.merge(data_melt,
                          gene_type,
                          how='left',
                          sort=False,
                          right_index=True,
                          left_index=True)
    # Keep the samples in the column order of the input table.
    cat_type = CategoricalDtype(categories=data.columns[1:], ordered=True)
    data_melt2['sample'] = data_melt2['sample'].astype(cat_type)

    # set width and height
    height = 6
    fontsize = 15
    # NOTE(review): the previous code derived a width from data.shape[0] and
    # then used ``width / width``, i.e. always 1.0. The square facet is kept
    # so the figures do not change; if a size depending on the number of
    # samples was intended, it has to be reintroduced deliberately.
    aspect = 1.0

    colors = list(reversed(sns.color_palette()[0:5]))
    hue_order = [
        "others", "pseudogene", "antisense", "lincRNA", "protein_coding"
    ]
    sns.displot(data_melt2,
                x="sample",
                hue="gene_type",
                hue_order=hue_order,
                palette=colors,
                multiple="stack",
                shrink=.8,
                height=height,
                aspect=aspect)
    plt.xticks(rotation=90)
    plt.xlabel('Samples', fontsize=fontsize)
    plt.ylabel('Gene species', fontsize=fontsize)

    out_box = prefix + "_species_type.pdf"
    plt.savefig(out_box, bbox_inches='tight')
    plt.close()

    ### Gene species by expression interval
    values = pd.cut(
        data_melt['value'], [0, 1, 5, 10, 50, 100, 1000, 1000000],
        labels=['0~1', '1~5', '5~10', '10~50', '50~100', '100~1000', '>1000'])
    data_melt = data_melt.copy()  ### For SettingWithCopyWarning
    data_melt['ExpressionInterval'] = values

    sns.displot(data_melt,
                x="sample",
                hue="ExpressionInterval",
                multiple="stack",
                shrink=.8,
                height=height,
                aspect=aspect)
    plt.xticks(rotation=90)
    plt.xlabel('Samples', fontsize=fontsize)
    plt.ylabel('Gene species', fontsize=fontsize)

    out_box = prefix + "_species_EI.pdf"
    plt.savefig(out_box, bbox_inches='tight')
    plt.close()

    ### expression density
    sns.kdeplot(data=data_melt,
                x="value",
                hue='sample',
                log_scale=True,
                common_norm=False)
    plt.xlabel('log10(TPM)', fontsize=15)
    out_box = prefix + "_density.pdf"
    plt.savefig(out_box, bbox_inches='tight')
    plt.close()
Beispiel #5
0
        '-trim5',
        default=0,
        type=int,
        metavar='N',
        help='remove N bases from the begining of each read (default:0)')
    parser.add_argument('--counts',
                        action='store_true',
                        help='Get gene counts')
    parser.add_argument('--rRNA',
                        action='store_true',
                        help='whether mapping to rRNA(Human)')
    parser.add_argument('-v',
                        '--version',
                        action='version',
                        version='%(prog)s 0.0.2')

    args = parser.parse_args()

    print("\n%s ..... Start RNAseq processing" % (current_time()))
    start_time = time.time()

    process(args.input, args.output, args.adapter, args.threads,
            args.libraryPath, args.mapper, args.minlen, args.trim5,
            args.counts, args.rRNA)

    ###
    end_time = time.time()
    run_time = round((end_time - start_time) / 60, 5)
    print("\n%s ..... Finished all. Used time: %s m\n" %
          (current_time(), run_time))
Beispiel #6
0
def SubmitTask(fi, output, adapter, threads, libpath, mapper, tasks, name,
               minlen, trim5, queue, counts, rRNA):
    """Submit one ``rapvis_process.py`` job per FASTQ pair through ``qsub``.

    Files in ``fi`` ending in ``fastq``, ``fastq.gz``, ``fq.gz`` or ``fq`` are
    sorted by name and taken two at a time as (R1, R2). For every pair a job
    script ``tmp.sh`` is written to the current directory, submitted with
    ``qsub -cwd -q <queue>`` and removed again. Before each submission the
    function waits, polling every 10 seconds, until ``GetRunningTasks(name)``
    is below ``tasks``.

    Args:
        fi: directory containing the FASTQ files.
        output, adapter, threads, libpath, mapper, minlen, trim5: forwarded
            unchanged to ``rapvis_process.py``.
        tasks: maximum number of simultaneously running tasks.
        name: base job name; job *k* is named ``<name>.<k>``, and stdout /
            stderr go to ``<name>.o`` / ``<name>.e``.
        queue: queue name passed to ``qsub -q``.
        counts: add ``--counts`` to the job command when true.
        rRNA: add ``--rRNA`` to the job command when true.

    Raises:
        ValueError: if the number of FASTQ files is odd, so that they cannot
            be paired. Nothing is submitted in that case.
    """
    import os  # local import: only needed to remove the temporary script

    ### get the data with fastq format
    fastq_suffixes = ('fastq', 'fastq.gz', 'fq.gz', 'fq')
    files = sorted(
        path for path in glob.glob("%s/*" % fi)
        if path.endswith(fastq_suffixes))

    # Check pairing before anything is submitted; otherwise the last,
    # unpaired file would only be noticed after earlier jobs were queued.
    if len(files) % 2:
        raise ValueError(
            "found %d FASTQ files in %s; paired-end input needs an even "
            "number" % (len(files), fi))

    total = len(files) // 2
    # Optional command-line switches, identical for every job.
    rRNA_flag = '--rRNA' if rRNA else ''
    counts_flag = '--counts' if counts else ''
    realpath = sys.path[0]  # directory of the running script
    tmp = "tmp.sh"

    for f_num, i in enumerate(range(0, len(files), 2), start=1):
        # Wait for a free slot; the message reports jobs submitted so far.
        while GetRunningTasks(name) >= tasks:
            time.sleep(10)
            print("%s, Submitted Tasks: %d, total: %d" %
                  (current_time(), f_num - 1, total))

        R1 = files[i]
        R2 = files[i + 1]
        jobName = name + '.' + str(f_num)

        with open(tmp, 'w') as script:
            script.write("#!/bin/bash\n")
            script.write("#$ -o %s.o\n" % name)
            script.write("#$ -e %s.e\n" % name)
            script.write("#$ -N %s\n" % jobName)
            script.write("source ~/.bashrc\n")
            script.write("source ~/.bash_profile\n")
            script.write(
                "python %s/rapvis_process.py -R1 %s -R2 %s -o %s -a %s -p %d -lib %s -m %s --minlen %d --trim5 %d %s %s\n"
                % (realpath, R1, R2, output, adapter, threads, libpath,
                   mapper, minlen, trim5, counts_flag, rRNA_flag))

        try:
            subprocess.call("qsub -cwd -q %s %s" % (queue, tmp),
                            shell=True)
        finally:
            # Always clean up the script, even if the submission call fails.
            os.remove(tmp)
        time.sleep(1)