Beispiel #1
0
def convert_to_xlsx(args):
    """
    Convert the tab-separated sample sheet into an xlsx workbook.

    Reads ``args.samples`` (lines starting with ``#`` are treated as
    comments), keeps the method, condition and replicate of every row,
    reduces each fastq path to its second ``/``-separated component and
    writes the result as a single "samples" sheet to ``args.output``.
    """
    raw_df = pd.read_csv(args.samples, comment="#", sep="\t")
    capitalized_columns = [column.capitalize() for column in raw_df.columns]

    # A table with exactly five columns is expected to carry a second fastq
    # path per sample in its "fastqFile2" column.
    has_second_fastq = len(raw_df.columns) == 5

    records = []
    for entry in raw_df.itertuples(index=False, name='Pandas'):
        record = [
            entry.method,
            entry.condition,
            entry.replicate,
            entry.fastqFile.split("/")[1],
        ]
        if has_second_fastq:
            record.append(entry.fastqFile2.split("/")[1])
        records.append(record)

    sheet_df = pd.DataFrame.from_records(records, columns=capitalized_columns)

    eu.excel_writer(args.output, {"samples": sheet_df}, [])
def create_excel_sheets(args):
    """
    Build the CDS and misc sheets and write them into one excel workbook.

    Inputs taken from ``args``:
      * ``genome_path``  - fasta file with the reference sequences
      * ``total_mapped`` - tab-separated lines of
                           ``wildcard<TAB>reference name<TAB>count``
      * ``output_path``  - destination handed to ``eu.excel_writer``

    ``args`` is also passed on unchanged to ``create_cds_excel_sheet`` and
    ``create_misc_excel_sheet``.
    """
    # Map every sequence id to a (sequence, complement) pair of strings.
    genome_dict = {
        str(record.id): (str(record.seq), str(record.seq.complement()))
        for record in SeqIO.parse(args.genome_path, "fasta")
    }

    # Total mapped reads keyed by (wildcard, reference name); the wildcards
    # are collected in file order along the way.
    total_mapped_dict = {}
    wildcards = []
    with open(args.total_mapped, "r") as handle:
        for line in handle:
            wildcard, reference_name, value = line.strip().split("\t")
            total_mapped_dict[(wildcard, reference_name)] = int(value)
            wildcards.append(wildcard)

    wildcards = eu.get_unique(wildcards)
    te_header = eu.get_te_header(wildcards)

    # The condition is the second "-"-separated field of each wildcard.
    conditions = eu.get_unique([card.split("-")[1] for card in wildcards])

    # Every unordered pair of conditions becomes one "x-y" contrast label.
    contrasts = sorted(
        f"{first}-{second}"
        for first, second in iter.combinations(conditions, 2))

    sheets = create_cds_excel_sheet(args, {}, genome_dict, total_mapped_dict,
                                    wildcards, conditions, contrasts,
                                    te_header)
    sheets = create_misc_excel_sheet(args, sheets, genome_dict,
                                     total_mapped_dict, wildcards, conditions,
                                     contrasts, te_header)

    eu.excel_writer(args.output_path, sheets, wildcards)
def create_excel_file(args):
    """
    Write a "CDS" excel sheet with one row per entry of the read-count table.

    Inputs taken from ``args``:
      * ``genome``       - fasta file with the reference sequences
      * ``total_mapped`` - tab-separated lines of
                           ``wildcard<TAB>chromosome<TAB>count``
      * ``reads``        - headerless tab-separated table; columns 0, 2, 3, 4,
                           6 and 8 are read as chromosome, feature, start,
                           stop, strand and attributes, and the trailing
                           ``len(wildcards)`` columns as per-sample read counts
      * ``output_path``  - destination handed to ``eu.excel_writer``
    """
    # Map every sequence id to a (sequence, complement) pair of strings.
    genome_dict = {
        str(record.id): (str(record.seq), str(record.seq.complement()))
        for record in SeqIO.parse(args.genome, "fasta")
    }

    # Total mapped reads keyed by (wildcard, chromosome).
    total_mapped_dict = {}
    wildcards = []
    with open(args.total_mapped, "r") as handle:
        for line in handle:
            wildcard, reference, value = line.strip().split("\t")
            total_mapped_dict[(wildcard, reference)] = int(value)
            wildcards.append(wildcard)

    wildcards = eu.get_unique(wildcards)
    te_header = eu.get_te_header(wildcards)

    # The condition is the second "-"-separated field of each wildcard.
    conditions = eu.get_unique([card.split("-")[1] for card in wildcards])

    # Annotation columns followed by one read-count column per wildcard.
    read_df = pd.read_csv(args.reads, comment="#", header=None, sep="\t")

    header = (
        ["Identifier", "Genome", "Source", "Feature", "Start", "Stop",
         "Strand", "Pred_probability", "Locus_tag", "Old_locus_tag", "Name",
         "Length", "Codon_count"]
        + [cond + "_TE" for cond in te_header]
        + [card + "_rpkm" for card in wildcards]
        + ["Evidence", "Start_codon", "Stop_codon", "15nt upstream",
           "Nucleotide_seq", "Aminoacid_seq"]
    )

    # Everything in front of the trailing read-count columns is annotation.
    prefix_columns = len(read_df.columns) - len(wildcards)

    # One field per header column, so a row of the wrong width fails when
    # the tuple is constructed.
    nTuple = collections.namedtuple(
        'Pandas', [f"s{position}" for position in range(len(header))])

    cds_sheet = []
    for row in read_df.itertuples(index=False, name='Pandas'):
        chromosome = row._0
        feature = row._2
        start = row._3
        stop = row._4
        strand = row._6
        attributes = row._8

        (start_codon, stop_codon, nucleotide_seq, aa_seq,
         nt_window) = eu.get_genome_information(
             genome_dict[chromosome], start - 1, stop - 1, strand)
        (pred_value, name, _product, _note, evidence, locus_tag,
         old_locus_tag) = eu.retrieve_column_information(attributes)

        length = stop - start + 1
        codon_count = int(length / 3)

        read_counts = [
            getattr(row, f"_{column}")
            for column in range(prefix_columns, len(row))
        ]
        rpkm_list = [
            eu.calculate_rpkm(
                total_mapped_dict[(wildcards[idx], chromosome)], count, length)
            for idx, count in enumerate(read_counts)
        ]
        te_list = eu.calculate_te(rpkm_list, wildcards, conditions)

        identifier = f"{chromosome}:{start}-{stop}:{strand}"
        # The "Source" column is always the literal "reparation".
        result = [
            identifier, chromosome, "reparation", feature, start, stop,
            strand, pred_value, locus_tag, old_locus_tag, name, length,
            codon_count
        ] + te_list + rpkm_list + [
            evidence, start_codon, stop_codon, nt_window, nucleotide_seq,
            aa_seq
        ]

        cds_sheet.append(nTuple(*result))

    cds_df = pd.DataFrame.from_records(cds_sheet, columns=list(header))
    cds_df = cds_df.sort_values(by=["Genome", "Start", "Stop"])

    eu.excel_writer(args.output_path, {"CDS": cds_df}, wildcards)
def parse_orfs(args):
    """
    Summarise read counts per feature class and write them to a "Main" sheet.

    Inputs taken from ``args``:
      * ``total_mapped`` - tab-separated lines with three fields; only the
                           first field (the wildcard) is used here
      * ``reads``        - headerless tab-separated table; column 2 holds the
                           feature type and the trailing ``len(wildcards)``
                           columns hold the per-sample read counts
      * ``output``       - destination handed to ``eu.excel_writer``

    Rows whose feature type is not one of the known classes are reported on
    stdout and left out of every sum, including "total".
    """
    wildcards = []
    with open(args.total_mapped, "r") as handle:
        for line in handle:
            wildcard, _reference_name, _value = line.strip().split("\t")
            wildcards.append(wildcard)
    wildcards = get_unique(wildcards)

    # Annotation columns followed by one read-count column per wildcard.
    read_df = pd.read_csv(args.reads, comment="#", header=None, sep="\t")

    header = ["Orientation", "Class", "Feature count"] + wildcards
    prefix_columns = len(read_df.columns) - len(wildcards)
    nTuple = collections.namedtuple(
        'Pandas', [f"s{position}" for position in range(len(header))])

    # Lower-case feature key -> label shown in the sheet, in output order.
    feature_labels = [
        ("srna", "sRNA"),
        ("5'-utr", "5'-UTR"),
        ("cds", "CDS"),
        ("rrna", "rRNA"),
        ("trna", "tRNA"),
        ("transcript", "transcript"),
        ("pseudogene", "pseudogene"),
        ("total", "total"),
    ]
    decode = dict(feature_labels)
    read_dict = collections.OrderedDict(
        (key, [0] * len(wildcards)) for key, _label in feature_labels)
    count_dict = collections.OrderedDict(
        (key, 0) for key, _label in feature_labels)

    for row in read_df.itertuples(index=False, name='Pandas'):
        feature = row._2
        key = feature.lower()

        if key not in read_dict:
            print("feature not usable: " + feature)
            continue

        read_counts = [
            getattr(row, f"_{column}")
            for column in range(prefix_columns, len(row))
        ]
        # Every usable row contributes to its own class and to "total".
        for idx, value in enumerate(read_counts):
            read_dict[key][idx] += value
            read_dict["total"][idx] += value
        count_dict[key] += 1
        count_dict["total"] += 1

    main_sheet = []
    for key, sums in read_dict.items():
        # Read sums are rounded to two decimals for the sheet.
        rounded = [float("%.2f" % total) for total in sums]
        main_sheet.append(
            nTuple(*(["sense", decode[key], count_dict[key]] + rounded)))

    main_df = pd.DataFrame.from_records(main_sheet, columns=list(header))

    eu.excel_writer(args.output, {"Main": main_df}, wildcards)
def xtail_output(args):
    """
    Combine a differential-expression table with annotation and sequence data
    and write the result to an "all" excel sheet.

    Inputs taken from ``args``:
      * ``genome``          - fasta file with the reference sequences
      * ``annotation_file`` - passed to ``annotation_to_dict``
      * ``input_csv``       - comma-separated table with one row per ID
      * ``output``          - destination handed to ``eu.excel_writer``

    An ID missing from the annotation must have the form
    ``chromosome:start-stop:strand``; otherwise the program exits with an
    error message.
    """
    # Map every sequence id to a (sequence, complement) pair of strings.
    genome_dict = {
        str(record.id): (str(record.seq), str(record.seq.complement()))
        for record in SeqIO.parse(args.genome, "fasta")
    }

    annotation_dict = annotation_to_dict(args.annotation_file)

    diff_expr_df = pd.read_csv(args.input_csv, sep=",", comment="#")

    header = [
        "Genome", "Start", "Stop", "Strand", "Locus_tag", "Old_locus_tag",
        "ID", "Name", "mRNA_log2FC", "RPF_log2FC", "log2FC_TE_v1", "pvalue_v1",
        "log2FC_TE_v2", "pvalue_v2", "log2FC_TE_final", "pvalue_final",
        "pvalue.adjust", "Length", "Codon_count", "Start_codon", "Stop_codon",
        "Nucleotide_seq", "Aminoacid_seq"
    ]
    nTuple = collections.namedtuple(
        'Pandas', [f"s{position}" for position in range(len(header))])

    # Statistic columns in output order. "_9" is a positional name assigned
    # by itertuples - presumably the "pvalue.adjust" column, whose name is
    # not a valid identifier; confirm against the input table.
    statistic_fields = (
        "mRNA_log2FC", "RPF_log2FC", "log2FC_TE_v1", "pvalue_v1",
        "log2FC_TE_v2", "pvalue_v2", "log2FC_TE_final", "pvalue_final", "_9"
    )

    all_sheet = []
    for row in diff_expr_df.itertuples(index=False, name='Pandas'):
        # First column of the table holds the ID.
        cds_id = row._0

        if cds_id in annotation_dict:
            (chromosome, start, stop, strand, attributes, locus_tag,
             old_locus_tag) = annotation_dict[cds_id]
            column_info = retrieve_column_information(attributes)
        elif ":" in cds_id and "-" in cds_id:
            # Not annotated: recover the coordinates from the ID itself.
            chromosome, span, strand = cds_id.split(":")
            start, stop = span.split("-")
            column_info = ["", "", ""]
            locus_tag, old_locus_tag = "", ""
        else:
            sys.exit("Error... ID is not novel and not in the annotation!")

        statistics = [getattr(row, field) for field in statistic_fields]

        start = int(start)
        stop = int(stop)
        length = stop - start + 1
        codon_count = int(length / 3)
        (start_codon, stop_codon, nucleotide_seq,
         aa_seq) = get_genome_information(genome_dict[chromosome], start - 1,
                                          stop - 1, strand)

        result = [
            chromosome, start, stop, strand, locus_tag, old_locus_tag, cds_id,
            column_info[0]
        ] + statistics + [
            length, codon_count, start_codon, stop_codon, nucleotide_seq,
            aa_seq
        ]

        all_sheet.append(nTuple(*result))

    all_df = pd.DataFrame.from_records(all_sheet, columns=list(header))

    eu.excel_writer(args.output, {"all": all_df}, [])