def convert_to_xlsx(args):
    """
    Convert the samples .tsv file to an .xlsx workbook.

    Reads ``args.samples`` as a tab-separated table (lines starting with
    ``#`` are skipped), strips the directory part from the fastq file
    columns and writes the result as a single "samples" sheet to
    ``args.output`` through ``eu.excel_writer``.

    The table must provide the columns ``method``, ``condition``,
    ``replicate`` and ``fastqFile``; when it has exactly five columns a
    ``fastqFile2`` column is expected as well. The sheet's column titles
    are the capitalized input column names.

    :param args: object exposing ``samples`` (input path) and ``output``
        (path handed to ``eu.excel_writer``).
    """
    samples_df = pd.read_csv(args.samples, comment="#", sep="\t")
    column_names = [name.capitalize() for name in samples_df.columns]
    has_second_fastq = len(samples_df.columns) == 5

    samples_sheet = []
    for row in samples_df.itertuples(index=False, name='Pandas'):
        # Keep only the file name: the last "/"-separated component. Taking
        # index 1 of the split returned a directory name for nested or
        # absolute paths and failed for a path without any "/".
        fastq = row.fastqFile.rsplit("/", 1)[-1]
        ntup = [row.method, row.condition, row.replicate, fastq]

        if has_second_fastq:
            ntup.append(row.fastqFile2.rsplit("/", 1)[-1])

        samples_sheet.append(ntup)

    samples_df = pd.DataFrame.from_records(samples_sheet, columns=column_names)

    sheets = {"samples": samples_df}
    eu.excel_writer(args.output, sheets, [])
def create_excel_sheets(args):
    """
    Assemble all excel sheets and write them to ``args.output_path``.

    Loads the genome fasta (``args.genome_path``) and the table of total
    mapped reads (``args.total_mapped``), derives the wildcards, conditions
    and pairwise contrasts from it, lets ``create_cds_excel_sheet`` and
    ``create_misc_excel_sheet`` fill the sheet dictionary and finally hands
    it to ``eu.excel_writer``.

    :param args: object exposing ``genome_path``, ``total_mapped`` and
        ``output_path``; it is also passed on to the sheet builders.
    """
    # sequence id -> (sequence, complement), both as plain strings
    genome_dict = {
        str(record.id): (str(record.seq), str(record.seq.complement()))
        for record in SeqIO.parse(args.genome_path, "fasta")
    }

    # (wildcard, reference name) -> total mapped reads; every line must hold
    # exactly three tab-separated fields
    total_mapped_dict = {}
    wildcards = []
    with open(args.total_mapped, "r") as handle:
        for line in handle:
            wildcard, reference_name, value = line.strip().split("\t")
            total_mapped_dict[(wildcard, reference_name)] = int(value)
            wildcards.append(wildcard)

    wildcards = eu.get_unique(wildcards)
    te_header = eu.get_te_header(wildcards)

    # the second "-"-separated token of a wildcard is used as its condition
    conditions = eu.get_unique([card.split("-")[1] for card in wildcards])

    # all unordered condition pairs, formatted "<x>-<y>" and sorted
    contrasts = sorted(
        f"{first}-{second}"
        for first, second in iter.combinations(conditions, 2)
    )

    excel_sheet_dict = {}
    for build_sheet in (create_cds_excel_sheet, create_misc_excel_sheet):
        excel_sheet_dict = build_sheet(args, excel_sheet_dict, genome_dict,
                                       total_mapped_dict, wildcards,
                                       conditions, contrasts, te_header)

    eu.excel_writer(args.output_path, excel_sheet_dict, wildcards)
def create_excel_file(args):
    """
    Build the "CDS" sheet from a read-count table and write it as excel file.

    Inputs are the genome fasta (``args.genome``), the table of total mapped
    reads (``args.total_mapped``) and the headerless, tab-separated read
    table (``args.reads``). The trailing ``len(wildcards)`` columns of the
    read table are treated as the per-wildcard read counts. For every row
    the sequence information, RPKM values (``eu.calculate_rpkm``) and TE
    values (``eu.calculate_te``) are collected; the rows are sorted by
    genome, start and stop and written to ``args.output_path``.

    :param args: object exposing ``genome``, ``total_mapped``, ``reads`` and
        ``output_path``.
    """
    # sequence id -> (sequence, complement), both as plain strings
    genome_dict = {}
    for record in SeqIO.parse(args.genome, "fasta"):
        genome_dict[str(record.id)] = (str(record.seq),
                                       str(record.seq.complement()))

    # (wildcard, chromosome) -> total mapped reads; every line must hold
    # exactly three tab-separated fields
    total_mapped_dict = {}
    wildcards = []
    with open(args.total_mapped, "r") as handle:
        for line in handle:
            wildcard, chromosome, value = line.strip().split("\t")
            total_mapped_dict[(wildcard, chromosome)] = int(value)
            wildcards.append(wildcard)

    wildcards = eu.get_unique(wildcards)
    te_header = eu.get_te_header(wildcards)

    # the second "-"-separated token of a wildcard is used as its condition
    conditions = eu.get_unique([card.split("-")[1] for card in wildcards])

    # read table without header: columns are addressed by position
    read_df = pd.read_csv(args.reads, comment="#", header=None, sep="\t")

    header = ["Identifier", "Genome", "Source", "Feature", "Start", "Stop",
              "Strand", "Pred_probability", "Locus_tag", "Old_locus_tag",
              "Name", "Length", "Codon_count"]
    header += [cond + "_TE" for cond in te_header]
    header += [card + "_rpkm" for card in wildcards]
    header += ["Evidence", "Start_codon", "Stop_codon", "15nt upstream",
               "Nucleotide_seq", "Aminoacid_seq"]

    # index of the first read-count column
    prefix_columns = len(read_df.columns) - len(wildcards)
    nTuple = collections.namedtuple(
        'Pandas', [f"s{position}" for position in range(len(header))])

    cds_sheet = []
    for row in read_df.itertuples(index=False, name='Pandas'):
        # positional columns are exposed by itertuples as _0, _1, ...
        chromosome = row._0
        feature = row._2
        start = row._3
        stop = row._4
        strand = row._6
        attributes = row._8

        (start_codon, stop_codon, nucleotide_seq, aa_seq,
         nt_window) = eu.get_genome_information(genome_dict[chromosome],
                                                start - 1, stop - 1, strand)
        (pred_value, name, _product, _note, evidence, locus_tag,
         old_locus_tag) = eu.retrieve_column_information(attributes)

        length = stop - start + 1
        codon_count = int(length / 3)

        read_list = [getattr(row, f"_{column}")
                     for column in range(prefix_columns, len(row))]
        rpkm_list = [
            eu.calculate_rpkm(total_mapped_dict[(wildcards[idx], chromosome)],
                              val, length)
            for idx, val in enumerate(read_list)
        ]
        te_list = eu.calculate_te(rpkm_list, wildcards, conditions)

        identifier = f"{chromosome}:{start}-{stop}:{strand}"
        leading = [identifier, chromosome, "reparation", feature, start, stop,
                   strand, pred_value, locus_tag, old_locus_tag, name, length,
                   codon_count]
        trailing = [evidence, start_codon, stop_codon, nt_window,
                    nucleotide_seq, aa_seq]

        cds_sheet.append(nTuple(*(leading + te_list + rpkm_list + trailing)))

    cds_df = pd.DataFrame.from_records(cds_sheet, columns=list(header))
    cds_df = cds_df.sort_values(by=["Genome", "Start", "Stop"])

    eu.excel_writer(args.output_path, {"CDS": cds_df}, wildcards)
def parse_orfs(args):
    """
    Summarise read counts per feature type and write them as excel file.

    The wildcards are taken from the first column of ``args.total_mapped``.
    The headerless, tab-separated table ``args.reads`` is then scanned row
    by row: the trailing ``len(wildcards)`` columns are added up per feature
    type (third column, compared case-insensitively) and into a "total"
    entry, and the number of rows per feature type is counted. Rows with an
    unknown feature type are reported on stdout and skipped. The summary is
    written as the "Main" sheet to ``args.output``.

    :param args: object exposing ``total_mapped``, ``reads`` and ``output``.
    """
    # every line must hold exactly three tab-separated fields; only the
    # first one (the wildcard) is needed here
    wildcards = []
    with open(args.total_mapped, "r") as handle:
        for line in handle:
            wildcard, _reference_name, _value = line.strip().split("\t")
            wildcards.append(wildcard)
    wildcards = get_unique(wildcards)

    # read table without header: columns are addressed by position
    read_df = pd.read_csv(args.reads, comment="#", header=None, sep="\t")

    header = ["Orientation", "Class", "Feature count"] + wildcards
    # index of the first read-count column
    prefix_columns = len(read_df.columns) - len(wildcards)
    nTuple = collections.namedtuple(
        'Pandas', [f"s{position}" for position in range(len(header))])

    # lower-case feature type -> label shown in the sheet
    decode = {
        "srna": "sRNA",
        "5'-utr": "5'-UTR",
        "cds": "CDS",
        "rrna": "rRNA",
        "trna": "tRNA",
        "transcript": "transcript",
        "pseudogene": "pseudogene",
        "total": "total"
    }
    # row order of the resulting sheet
    feature_list = [
        "srna", "5'-utr", "cds", "rrna", "trna", "transcript", "pseudogene",
        "total"
    ]

    read_dict = collections.OrderedDict(
        (name, [0] * len(wildcards)) for name in feature_list)
    count_dict = collections.OrderedDict((name, 0) for name in feature_list)

    for row in read_df.itertuples(index=False, name='Pandas'):
        # reference name and coordinates are read but not used below
        _reference_name, _start, _stop = row._0, row._3, row._4
        feature = row._2
        read_list = [getattr(row, f"_{column}")
                     for column in range(prefix_columns, len(row))]

        key = feature.lower()
        if key not in read_dict:
            print("feature not usable: " + feature)
            continue

        for idx, value in enumerate(read_list):
            read_dict[key][idx] += value
            read_dict["total"][idx] += value
        count_dict[key] += 1
        count_dict["total"] += 1

    # one row per feature type, sums rounded to two decimals
    main_sheet = [
        nTuple("sense", decode[key], count_dict[key],
               *[float("%.2f" % v) for v in val])
        for key, val in read_dict.items()
    ]

    main_df = pd.DataFrame.from_records(main_sheet, columns=list(header))

    eu.excel_writer(args.output, {"Main": main_df}, wildcards)
def xtail_output(args):
    """
    Convert the differential expression csv into an annotated excel file.

    Every row of ``args.input_csv`` is identified by its first column. The
    coordinates are looked up in the annotation (``args.annotation_file``);
    identifiers missing there are accepted when they contain ":" and "-",
    in which case they are split as ``<chromosome>:<start>-<stop>:<strand>``.
    Any other identifier terminates the program via ``sys.exit``. The
    statistics columns are copied over, length, codon count and sequence
    information are added, and the table is written as sheet "all" to
    ``args.output``.

    :param args: object exposing ``genome``, ``annotation_file``,
        ``input_csv`` and ``output``.
    """
    # sequence id -> (sequence, complement), both as plain strings
    genome_dict = {
        str(record.id): (str(record.seq), str(record.seq.complement()))
        for record in SeqIO.parse(args.genome, "fasta")
    }

    annotation_dict = annotation_to_dict(args.annotation_file)

    diff_expr_df = pd.read_csv(args.input_csv, sep=",", comment="#")

    header = [
        "Genome", "Start", "Stop", "Strand", "Locus_tag", "Old_locus_tag",
        "ID", "Name", "mRNA_log2FC", "RPF_log2FC", "log2FC_TE_v1",
        "pvalue_v1", "log2FC_TE_v2", "pvalue_v2", "log2FC_TE_final",
        "pvalue_final", "pvalue.adjust", "Length", "Codon_count",
        "Start_codon", "Stop_codon", "Nucleotide_seq", "Aminoacid_seq"
    ]
    nTuple = collections.namedtuple(
        'Pandas', [f"s{position}" for position in range(len(header))])

    # row attributes copied verbatim into the sheet; "_9" is the positional
    # attribute written to the "pvalue.adjust" column
    statistic_fields = ("mRNA_log2FC", "RPF_log2FC", "log2FC_TE_v1",
                        "pvalue_v1", "log2FC_TE_v2", "pvalue_v2",
                        "log2FC_TE_final", "pvalue_final", "_9")

    all_sheet = []
    for row in diff_expr_df.itertuples(index=False, name='Pandas'):
        cds_id = row._0

        if cds_id in annotation_dict:
            (chromosome, start, stop, strand, attributes, locus_tag,
             old_locus_tag) = annotation_dict[cds_id]
            column_info = retrieve_column_information(attributes)
        elif ":" in cds_id and "-" in cds_id:
            # not annotated: take the coordinates from the identifier itself
            chromosome, sec, strand = cds_id.split(":")
            start, stop = sec.split("-")
            column_info = ["", "", ""]
            locus_tag, old_locus_tag = "", ""
        else:
            sys.exit("Error... ID is not novel and not in the annotation!")

        statistics = [getattr(row, field) for field in statistic_fields]

        start, stop = int(start), int(stop)
        length = stop - start + 1
        codon_count = int(length / 3)

        start_codon, stop_codon, nucleotide_seq, aa_seq = get_genome_information(
            genome_dict[chromosome], start - 1, stop - 1, strand)

        coordinates = [chromosome, start, stop, strand, locus_tag,
                       old_locus_tag, cds_id, column_info[0]]
        sequence_info = [length, codon_count, start_codon, stop_codon,
                         nucleotide_seq, aa_seq]

        all_sheet.append(nTuple(*(coordinates + statistics + sequence_info)))

    all_df = pd.DataFrame.from_records(all_sheet, columns=list(header))

    eu.excel_writer(args.output, {"all": all_df}, [])