def get_all_gene_names(in_files):
    """
    Collects the gene names shared by a set of htseq-count output files.

    The first file defines the reference list of gene names. Every later file
    may only contain gene names from that list and must contain the same number
    of gene lines; otherwise an error message is written to stderr and the
    script exits with status 1. Lines starting with "__" are skipped
    (presumably the htseq-count summary rows, e.g. "__no_feature").

    Note: a gene name repeated within a file is counted once per line, so it
    also triggers the "mismatch in the number of genes" error.

    Input: iterable of paths to htseq-count output files
    Output: list of unique gene names, in the order they appear in the first file
    """
    genes_list = list()
    # Set mirror of genes_list: keeps the membership test O(1) per line instead
    # of scanning the whole list for every gene line of every file
    seen_genes = set()
    for file_counter, in_file in enumerate(in_files):
        file_gene_counter = 0
        for line in hpf.l(in_file):
            if line.startswith("__"):
                continue
            gene_name = line.split("\t")[0]
            if gene_name not in seen_genes:
                if file_counter != 0:
                    # Only the first file is allowed to introduce gene names
                    sys.stderr.write(
                        "Error: gene name " + gene_name + " in file " +
                        in_file +
                        " mismatches gene names in previously read file(s) in the same folder\n"
                    )
                    sys.exit(1)
                seen_genes.add(gene_name)
                genes_list.append(gene_name)
            file_gene_counter += 1
        if file_gene_counter != len(genes_list):
            sys.stderr.write(
                "Error: mismatch in the number of genes between input files (encountered when reading "
                + in_file + ")\n")
            sys.exit(1)
    return genes_list
Beispiel #2
0
def main(input_vcf_file, out_path, window_step, exclude_samples):
    """
    Counts variant calls per sample in fixed-size coordinate windows of a VCF
    file and writes the resulting table to a CSV file.

    A sample entry is counted as a variant when the part before its first ":"
    is neither "0" nor ".". Windows are labelled "bin_NNN", where NNN is
    floor(position / window_step), zero-padded to at least 3 digits.

    Input:
        input_vcf_file: path to the VCF file
        out_path: path of the CSV file to write
        window_step: window size, in the units of the VCF position column
        exclude_samples: space-separated sample names to leave out, or None
    Output: none (a CSV file with one column per window and one row per
        sample is written to out_path)
    """
    excluded = set()
    if exclude_samples is not None:
        excluded = set(exclude_samples.split(" "))

    in_data = hpf.l(input_vcf_file)

    # Keep the "#CHROM" column header line and the variant lines; drop the
    # other "#" meta lines
    in_data = [n for n in in_data if not n.startswith("#") or n.startswith("#CHROM")]
    header = in_data[0]
    sample_names = header.split("\t")[9:]
    # The samples to report and their column offsets do not change between
    # lines, so work them out once instead of re-testing every sample per line
    kept_samples = [(i, name) for i, name in enumerate(sample_names)
                    if name not in excluded]
    counts_dict = defaultdict(dict)

    for line in in_data[1:]:
        split_line = line.split()
        sample_entries = split_line[9:]
        coord = int(split_line[1])
        # NOTE(review): only the position column is used; the sequence name in
        # column 1 is ignored, so windows with the same index on different
        # sequences share one bin - confirm the input holds a single sequence
        window_label = "bin_" + str(math.floor(coord / window_step)).zfill(3)

        for i, sample_name in kept_samples:
            sample_entry = sample_entries[i]
            window_counts = counts_dict[window_label]
            # Every reported sample gets an entry for the window, even with 0 variants
            window_counts.setdefault(sample_name, 0)
            if sample_entry != ".:0,0:0:.:0,0":
                allele = sample_entry.split(":")[0]
                # NOTE(review): only the exact strings "0" and "." count as
                # non-variant, so a diploid-style "0/0" call would be counted -
                # confirm the genotypes are haploid
                if allele != "0" and allele != ".":
                    window_counts[sample_name] += 1

    df = pd.DataFrame(counts_dict)
    df.to_csv(out_path)
def main(in_folder, out_path):
    """
    Merges the htseq-count tables (.txt files) found in a folder into a single
    tab-separated table with one column per sample and one row per gene.

    Exits with status 1 when the folder contains no .txt files. Lines starting
    with "__" are skipped.

    Input:
        in_folder: folder with htseq-count output files
        out_path: path of the merged table to write
    Output: none (the table is written to out_path)
    """
    count_files = hpf.get_file_paths(in_folder, "txt")
    if not len(count_files):
        sys.stderr.write(
            "Error: no files with .txt extension was found in the input folder\n"
        )
        sys.exit(1)

    all_genes = get_all_gene_names(count_files)

    counts_by_sample = {}
    for count_file in count_files:
        sample = get_sample_name_from_file_name(count_file)
        # Start every known gene at zero, then overwrite with the counts
        # actually present in this file
        gene_counts = dict.fromkeys(all_genes, 0)
        for row in hpf.l(count_file):
            if row.startswith("__"):
                continue
            fields = row.split("\t")
            gene_counts[fields[0]] = int(fields[1])
        counts_by_sample[sample] = gene_counts

    merged_table = pd.DataFrame(counts_by_sample)
    merged_table.to_csv(out_path, sep="\t", quoting=csv.QUOTE_NONE)
Beispiel #4
0
def load_dn_data(dn_path):
    """
    Reads the dN results table, skipping its first line (presumably the header)
    and every row whose second whitespace-separated column is "NA"
    """
    kept_rows = []
    for row in hpf.l(dn_path)[1:]:
        if row.split()[1] == "NA":
            continue
        kept_rows.append(row)
    return kept_rows
def write_temp_alignment_file(alignment_file_path, temp_folder):
    """
    Copies an alignment FASTA file to "temp_seq.fa" in the temporary files folder.
    Header lines (starting with ">") are cut at their first underscore; all other
    lines are written unchanged
    """
    records = hpf.l(alignment_file_path)
    target_path = temp_folder + "/temp_seq.fa"
    with open(target_path, "w") as handle:
        for entry in records:
            is_header = entry.startswith(">")
            text = entry.split("_")[0] if is_header else entry
            handle.write(text + "\n")
Beispiel #6
0
def main(input_vcf_file, genome_size, exclude_samples):
    """
    Prints, for every sample of a VCF file, the number of variant calls per
    10 kb of genome, followed by the mean, median and standard deviation of
    these values across the reported samples.

    A sample entry is counted as a variant when the part before its first ":"
    is neither "0" nor ".". Excluded samples are still counted but are left
    out of the printed table and the summary statistics.

    Input:
        input_vcf_file: path to the VCF file
        genome_size: genome size, used to scale the counts to 10 kb units
        exclude_samples: space-separated sample names to leave out, or None
    Output: none (results are printed to stdout)
    """
    excluded = [] if exclude_samples is None else exclude_samples.split(" ")

    # Keep the "#CHROM" column header line and the variant lines only
    vcf_lines = [
        entry for entry in hpf.l(input_vcf_file)
        if not entry.startswith("#") or entry.startswith("#CHROM")
    ]
    sample_names = vcf_lines[0].split("\t")[9:]
    variant_counts = {name: 0 for name in sample_names}

    print("Sample\tMean_nr_of_variants_per_10_kb")

    for record in vcf_lines[1:]:
        genotype_fields = record.split()[9:]
        for column, name in enumerate(sample_names):
            field = genotype_fields[column]
            if field == ".:0,0:0:.:0,0":
                continue
            if field.split(":")[0] not in ("0", "."):
                variant_counts[name] += 1

    frequencies = []
    for name, count in variant_counts.items():
        if name in excluded:
            continue
        per_10_kb = count / (genome_size / 10000)
        print(name + "\t" + str(per_10_kb))
        frequencies.append(per_10_kb)

    print("-----")
    print("Averaged numbers across all samples")
    print("Mean number of variants per 10 kb:", np.mean(frequencies))
    print("Median number of variants per 10 kb:", np.median(frequencies))
    print("Standard deviation of the number of variants per 10 kb:",
          np.std(frequencies))
def main(in_path, out_folder, fasta_path, deselected_scaffolds_path):
    """
    Splits the first record of an EMBL file into one EMBL file per scaffold.

    Scaffold coordinates come from get_scaffold_coords_by_source_features()
    when fasta_path is an empty string, and from get_scaffold_coords_by_fasta()
    otherwise. The coordinates table is expected to provide the columns
    "header", "id", "start_coord" and "end_coord". For every scaffold that is
    not deselected, the matching slice of the first record's sequence is
    written to "<out_folder>/<id>.embl".

    Input:
        in_path: path to the input EMBL file
        out_folder: folder for the per-scaffold EMBL files (created if missing)
        fasta_path: path to a FASTA file with the scaffolds, or "" to derive
            the coordinates from the EMBL source features
        deselected_scaffolds_path: path to a file with scaffold names to skip,
            or "". It is only read when fasta_path is not empty.
    Output: none (files are written to out_folder; exits with status 1 if the
        output folder cannot be created)
    """
    deselected_scaffolds = []

    if fasta_path == "":
        coords_df = get_scaffold_coords_by_source_features(in_path)
    else:
        coords_df = get_scaffold_coords_by_fasta(in_path, fasta_path)
        if deselected_scaffolds_path != "":
            deselected_scaffolds = hpf.l(deselected_scaffolds_path)

    records = list(SeqIO.parse(in_path, "embl"))

    # os.makedirs instead of a shell "mkdir -p": no shell involved, so folder
    # names containing spaces or shell metacharacters are handled correctly
    try:
        os.makedirs(out_folder, exist_ok=True)
    except OSError:
        sys.stderr.write(
            "Error occurred when checking for the presence of output folder or creating the output folder ("
            + out_folder + ")\n")
        sys.exit(1)

    # Converted on first use only, so the conversion is not repeated for every scaffold
    union_seq = None

    for row_index in range(coords_df.shape[0]):
        coords_df_entry = coords_df.iloc[row_index]
        scaff_name = coords_df_entry["header"]
        scaff_id = coords_df_entry["id"]

        # Deselected scaffolds produce no output file. The write below belongs
        # to the selected case only: for a deselected scaffold there is no
        # record to write and no output path for it.
        if scaff_name in deselected_scaffolds:
            continue

        query_start_coord = int(coords_df_entry.start_coord)
        query_end_coord = int(coords_df_entry.end_coord)
        out_path = os.path.join(out_folder, scaff_id + ".embl")

        if union_seq is None:
            union_seq = str(records[0].seq)
        # NOTE(review): the slice ends at query_end_coord - 1 (exclusive), so
        # the base at end_coord itself is left out if the coordinates are
        # 1-based inclusive - confirm against the get_scaffold_coords_* helpers
        seq = union_seq[query_start_coord - 1:query_end_coord - 1]
        my_sequence_record = SeqRecord(Seq(seq),
                                       id=scaff_id,
                                       name=scaff_name,
                                       description="unknown_description",
                                       dbxrefs=[])
        # NOTE(review): generic_dna / Seq.alphabet come from Bio.Alphabet, which
        # was removed in Biopython 1.78 - this line needs an older Biopython
        my_sequence_record.seq.alphabet = generic_dna
        my_sequence_record.accession = "unknown_accession"

        # process_record_features is defined elsewhere; presumably it attaches
        # the features that fall inside the scaffold's coordinate range
        my_sequence_record = process_record_features(
            records, coords_df, query_start_coord, query_end_coord,
            my_sequence_record)

        SeqIO.write(my_sequence_record, out_path, "embl")
Beispiel #8
0
def load_pfam_domains(pfam_domains_path):
    """
    Loads a table where column 1 contains Hepatocystis gene names and column 2 contains the PFAM domains
    (comma separated) in the corresponding genes. The first line of the table is skipped and rows whose
    domain column is "NA" are ignored. Columns are split on any whitespace, not only on tabs.

    Input: path to the PFAM domains table
    Output: tuple of
        - list of unique PFAM domains, in order of first appearance
        - dictionary where the keys are gene names and the values are lists of the PFAM domains of the gene
    """
    domains_list = list()
    # Set mirror of domains_list: O(1) membership test instead of scanning the
    # list for every domain of every gene
    seen_domains = set()
    pfam_domains_dict = dict()
    for line in hpf.l(pfam_domains_path)[1:]:
        split_line = line.split()
        pfam_domain_entry = split_line[1]
        if pfam_domain_entry == "NA":
            continue
        pfam_domains = pfam_domain_entry.split(",")
        pfam_domains_dict[split_line[0]] = pfam_domains
        for pfam_domain in pfam_domains:
            if pfam_domain not in seen_domains:
                seen_domains.add(pfam_domain)
                domains_list.append(pfam_domain)
    return domains_list, pfam_domains_dict
def main(alignments_folder, output_folder, temp_folder, treefile_path):
    """
    Runs codeml on every alignment file of a folder and collects the results.

    For each file in alignments_folder, the alignment, a codeml control file
    and the tree file are written to temp_folder, "codeml codeml.ctl" is run
    with temp_folder as the working directory, and the resulting
    "temp_out.txt" is copied to output_folder as "<name>_codeml.txt", where
    <name> is the alignment file name up to its first ".fa".

    Input:
        alignments_folder: folder with the alignment files
        output_folder: folder for the codeml result files (created if missing)
        temp_folder: folder for temporary files (created if missing)
        treefile_path: path to the tree file used for all alignments
    Output: none (result files are written to output_folder). The working
        directory of the process is changed to temp_folder.
    """
    import shutil
    import sys

    treefile_content = hpf.l(treefile_path)

    # Resolve the folders before os.chdir(): relative paths would otherwise be
    # interpreted relative to temp_folder once the working directory changes
    alignments_folder = os.path.abspath(alignments_folder)
    output_folder = os.path.abspath(output_folder)
    temp_folder = os.path.abspath(temp_folder)

    # os.makedirs / shutil.copy instead of shell "mkdir -p" / "cp": paths with
    # spaces or shell metacharacters are handled correctly
    os.makedirs(temp_folder, exist_ok=True)
    os.makedirs(output_folder, exist_ok=True)
    os.chdir(temp_folder)

    temp_results_path = os.path.join(temp_folder, "temp_out.txt")

    for alignment_file in os.listdir(alignments_folder):
        alignment_file_path = os.path.join(alignments_folder, alignment_file)
        write_temp_alignment_file(alignment_file_path, temp_folder)
        write_codeml_ctl_file("temp_seq.fa", "temp_tree.treefile",
                              "temp_out.txt", temp_folder)
        write_temp_treefile(treefile_content, temp_folder)

        # Remove the output of the previous run so that a failed codeml run
        # cannot be mistaken for a result of the current alignment
        if os.path.exists(temp_results_path):
            os.remove(temp_results_path)

        os.system("codeml codeml.ctl")

        results_file_name = alignment_file.split(".fa")[0] + "_codeml.txt"
        results_file_path = os.path.join(output_folder, results_file_name)
        if not os.path.isfile(temp_results_path):
            sys.stderr.write("Warning: codeml produced no output for " +
                             alignment_file_path + ", skipping this file\n")
            continue
        shutil.copy(temp_results_path, results_file_path)
def extract_sequences_from_fasta_by_id(args):
    """
    Function for extracting sequences from a FASTA file by their names. The sequence names are truncated at the first whitespace
    character before they are compared to the query string. The function allows extracting sequences based on 1 query string and
    also reading a list of query strings from a text file. There is an 'invert' mode to extract all sequences that do not match
    the query string(s).

    Input: args object with the attributes
        string_query: True if args.query is the query string itself, otherwise args.query is the path to a file with query strings
        query: query string or path to the file with query strings
        invert: False to output matching sequences, otherwise the non-matching sequences are output
        fasta_data: iterable of (header, sequence) pairs
    Output: none (the selected sequences are passed to print_header_and_seq)
    """
    # A set is built once so that each FASTA record is looked up in O(1)
    # instead of scanning the full list of query strings
    if args.string_query == True:
        selected_ids = {args.query}
    else:
        selected_ids = set(hpf.l(args.query))

    # The explicit comparison with False is kept on purpose: any other value
    # of args.invert (including None) selects the inverted mode, as before
    inverted = not (args.invert == False)

    for header, seq in args.fasta_data:
        fasta_seq_id = header.split()[0]
        # Normal mode outputs the matches, inverted mode outputs the rest
        if (fasta_seq_id in selected_ids) != inverted:
            print_header_and_seq(header, seq)