def gc_count_fasta(fasta_dict, name):
    """
    Compute the mean GC content, as a percentage, over the fasta files
    registered under one key.

    Inputs:
        fasta_dict - a dictionary-like object that maps a name (key) to a
                     list of paths to the fasta files to be analyzed.
        name - a string representing the key in fasta_dict whose files
               are analyzed.

    Outputs:
        gc content - a float: the sum of gc_cython() over every sequence
                     parsed, divided by the number of files and
                     multiplied by 100. Returns 0.0 when the key has no
                     files.
    """
    filenames = fasta_dict[name]
    # get the number of files registered under this name
    num_fastas = len(filenames)
    # nothing to average; avoid a ZeroDivisionError below
    if num_fastas == 0:
        return 0.0
    # initialize the accumulator
    gc_tot = 0
    for filename in filenames:
        print(f'Reading and parsing the filename {filename}')
        # the record id is not needed; a distinct name also keeps the
        # `name` parameter from being shadowed
        for _seq_id, sequence in parse_fasta(filename):
            # presumably gc_cython returns a fraction in [0, 1], hence the
            # scaling by 100 below - TODO confirm
            gc_tot += gc_cython(sequence)
    # NOTE(review): the denominator is the number of files, not the number
    # of sequences; a multi-record fasta file contributes several terms
    # to the sum - confirm this is intended.
    return (gc_tot / num_fastas) * 100
def get_genome_length(filenames):
    """
    Collect the length of every sequence found in a list of fasta files.

    Inputs:
        filenames - an iterable of '/'-separated paths to fasta files.
                    The third path component is used as the genus name.

    Outputs:
        gen_len - a defaultdict(dict) mapping genus -> {sequence name:
                  sequence length}. A genus whose files yield no records
                  gets no entry.
    """
    gen_len = defaultdict(dict)
    for filename in filenames:
        # the genus is taken from the third path component; raises
        # IndexError when the path has fewer than three components
        genus = filename.split('/')[2]
        for name, seq in parse_fasta(filename):
            gen_len[genus][name] = len(seq)
    return gen_len
def genome_stats_in_windows(fasta_dict, name, as_overlap=False, k=20):
    """
    GC content in windows of length k along the concatenated sequences
    of all fasta files registered under one key.

    Inputs:
        fasta_dict - a dictionary-like object that maps a name (key) to a
                     list of paths to fasta files.
        name - a string representing the key in fasta_dict whose files
               are analyzed.
        as_overlap - boolean; when True the window slides one base at a
                     time (overlapping windows), otherwise it advances k
                     bases (non overlapping windows).
        k - an integer representing the window length. Default is 20.

    Outputs:
        gc_content - a list of (start, value) tuples, where start is the
                     0-based window start and value is
                     round(gc_cython(window), 4) * 100.

    Raises:
        ValueError - if k is smaller than 1.
    """
    if k < 1:
        raise ValueError('k must be a positive integer')
    # Collect every record in upper case. The previous version reused the
    # name `seq` for both the accumulator and the loop variable, so only
    # the last record (followed by its own upper-cased copy) survived.
    parts = []
    for file in fasta_dict[name]:
        for _seq_id, record in parse_fasta(file):
            parts.append(record.upper())
    # NOTE(review): records are concatenated, so a window can span the
    # boundary between two records/files - confirm this is intended.
    seq = ''.join(parts)
    # overlapping windows move one base at a time, otherwise k bases
    step = 1 if as_overlap else k
    # the array-like object to collect the data
    gc_content = []
    for start in range(0, len(seq) - k + 1, step):
        # the substring whose GC content is measured
        g_c = gc_cython(seq[start:start + k])
        gc_content.append((start, round(g_c, 4) * 100))
    return gc_content
def count_n_grams_fasta(fasta_dict, name, alphabet, kmin, kmax):
    """
    Count the k-mers of every sequence in the fasta files registered
    under one key and average the counts over the number of files.

    Inputs:
        fasta_dict - a dictionary-like object that maps a name (key) to a
                     list of paths to fasta files.
        name - a string representing the key in fasta_dict whose files
               are analyzed.
        alphabet - an iterable with the allowed characters; k-mers that
                   contain any other character are dropped.
        kmin - an integer representing the lower bound of the k-mer
               length.
        kmax - an integer representing the upper bound of the k-mer
               length.

    Outputs:
        final_counter - a dictionary mapping each retained k-mer to its
                        summed count floor-divided by the number of
                        files.
    """
    allowed = set(alphabet)
    paths = fasta_dict[name]
    num_fastas = len(paths)
    print(f'The number of fasta files for this genus is {num_fastas}.')
    # running total over every sequence of every file
    totals = Counter()
    for path in paths:
        print(f'Reading and parsing the file {path}')
        for _seq_id, sequence in parse_fasta(path):
            print(f'Sequence length {len(sequence)}')
            totals.update(count_kmers(sequence, kmin, kmax, counter=None))
    # integer mean per file, keeping only k-mers spelled with the alphabet
    final_counter = {}
    for kmer, total in totals.items():
        if set(kmer) <= allowed:
            final_counter[kmer] = total // num_fastas
    return final_counter
def count_bases_fasta(fasta_dict, name):
    """
    Count the bases of every sequence in the fasta files registered
    under one key and average the counts over the number of files.

    Inputs:
        fasta_dict - a dictionary-like object that maps a name (key) to a
                     list of paths to fasta files.
        name - a string representing the key in fasta_dict whose files
               are analyzed.

    Outputs:
        final_counter - a dictionary mapping each base to its summed
                        count floor-divided by the number of files.
                        (Only this dictionary is returned; no sequence
                        length is returned.)
    """
    paths = fasta_dict[name]
    # get the number of files registered under this name
    num_fastas = len(paths)
    print(f'The number of fasta files for this genus is {num_fastas}.')
    # running total over every sequence of every file
    counter = Counter()
    for filename in paths:
        print(f'Reading and parsing the file {filename}')
        # the record id is not needed; a distinct name also keeps the
        # `name` parameter from being shadowed
        for _seq_id, sequence in parse_fasta(filename):
            print(f'Sequence length {len(sequence)}')
            counter.update(count_bases_cython(sequence))
    # integer mean of each base count per file
    return {base: total // num_fastas for base, total in counter.items()}
def main():
    """
    Search every sequence of the input fasta files for the cut sites of
    the patterns read from the pattern file and save one csv per
    sequence in <dir_out>/<genus>/RE_cuts.

    Reads dir_in, dir_out and pattern_file from parse_arguments(). The
    genus/species name and the data name are taken from the third and
    fourth '/'-separated components of each file path.
    """
    # start counting the running time of the script
    start = time()
    # checking the current directory and printing it
    cwd = os.getcwd()
    print(colored(f'\nThe working directory: {cwd}\n',
                  'green',
                  attrs=['bold']))
    # passing the arguments to the script
    args = parse_arguments()
    # name of the input directory, ex. Data/Genomes_splitted
    dir_in = args.dir_in
    # name of the root directory to save the final result
    dir_out = args.dir_out
    # path and name of the text file with the patterns
    pattern_file = args.pattern_file
    # get the list of all paths to the files in the input directory
    all_files = get_files(dir_in)
    # get all patterns
    all_patterns = read_patterns(pattern_file)
    # check if the output directory exists, otherwise create it
    if os.path.exists(dir_out):
        print(
            colored('The directory to save the files already exists!',
                    'red',
                    attrs=['bold']))
    else:
        make_me_a_folder(dir_out)
    # initialize the file counter
    num_files = 0
    for filen in all_files:
        name = filen.split('/')[2]
        data = filen.split('/')[3]
        print(
            colored(f"Working with {data} from genus/species {name}",
                    attrs=['bold']))
        # the output directory only depends on the file, not on the record
        full_path = os.path.join(dir_out, name, 'RE_cuts')
        for n, seq in parse_fasta(filen):
            print(
                f'Start counting the restriction enzymes cut sites in the sequence {n}'
            )
            cut_sites = all_re_cut_sites(seq, all_patterns)
            df = pd.DataFrame(cut_sites, columns=['site', 'positions'])
            file_name = f'{n}_{data}_re_cuts.csv'
            # exist_ok avoids the check-then-create race of a separate
            # os.path.exists() test
            os.makedirs(full_path, exist_ok=True)
            print(f'Saving the files in {full_path}\n')
            df.to_csv(f'{full_path}/{file_name}', index=False)
        # the number of files analyzed
        num_files += 1
    # the final time
    end = time()
    # print some info
    print(
        colored(f"Total number of files analyzed: {num_files}\n.",
                attrs=['bold']))
    print(
        colored(
            f'Total time for the script finishes: {round(end - start, 2)}.',
            'red',
            attrs=['bold']))
    print(colored('Done!', 'green', attrs=['bold']))
def _compute_kmer_statistics(kmer_list, kmer_counts, len_seq):
    """Run the statistics pipeline for kmer_list and return the rows built by get_kmer_statistics()."""
    kmer_freqs = kmers_frequencies(kmer_counts)
    kmer_expected = get_expected_values(kmer_list, kmer_counts)
    kmer_zscores = get_z_scores(kmer_list, kmer_counts, kmer_expected,
                                len_seq)
    kmer_pvalues = get_pvalues(kmer_list, kmer_zscores)
    kmer_evalues = get_evalues(kmer_list, kmer_pvalues)
    kmer_scores = get_scores(kmer_list, kmer_counts, kmer_expected)
    kmer_nscores = get_new_scores(kmer_list, kmer_counts, kmer_expected)
    kmer_odds_ratio = get_odds_ratio(kmer_list, kmer_freqs)
    kmer_diff = get_difference(kmer_list, kmer_counts, kmer_expected)
    kmer_lod = get_log_odds(kmer_list, kmer_counts, kmer_expected)
    return get_kmer_statistics(kmer_list, kmer_counts, kmer_expected,
                               kmer_zscores, kmer_evalues, kmer_odds_ratio,
                               kmer_diff, kmer_scores, kmer_nscores,
                               kmer_lod)


def main():
    """Parse the command line options and compute k-mer statistics.

    For every sequence of every fasta file in the input directory, and
    depending on the options set:
        kmer  - statistics for all possible k-mers, saved per sequence
                together with the raw k-mer counts.
        pal   - statistics for the palindromes, after adding the counts
                of the reverse complements to the k-mer counts.
        all   - statistics for all possible k-mers, saved under the
                user-supplied output name.
        slide - k-mer counts in sliding windows.
    The results are printed and/or saved as csv files in opt.dir_out.
    """
    cwd = os.getcwd()
    print(f'The working directory: {cwd}\n')
    start_time = time.process_time()
    opt = parse_arguments()
    dir_name = opt.path
    filenames = get_files(dir_name)
    outfile = opt.output
    dir_out = opt.dir_out
    if not os.path.exists(dir_out):
        make_me_a_folder(dir_out)
    cnt_files = 0
    for filename in filenames:
        for name, seq in fasta_parser.parse_fasta(filename):
            name = fasta_parser.str_punctuation_strip(name)
            # short label from the first and last three items of the name
            # (presumably a list of words - TODO confirm)
            n_name = '_'.join(name[0:3] + name[-3:])
            # sequence length without the ambiguous bases
            len_seq = len(seq) - count_umbiguous_bases(seq)
            if opt.kmer:
                # counts start at min_k - 2 because the statistics are given
                # shorter k-mers as well (presumably for the expected
                # values - TODO confirm)
                kmer_counts = count_kmers(seq, opt.alphabet, opt.min_k - 2,
                                          opt.max_k)
                kmer_list = get_all_possible_kmers(opt.alphabet, opt.min_k,
                                                   opt.max_k)
                kmer_data = _compute_kmer_statistics(kmer_list, kmer_counts,
                                                     len_seq)
                print_results_stats(n_name, kmer_list, len_seq, opt.min_k,
                                    opt.max_k, opt.max_e, kmer_data)
                df = pd.DataFrame(kmer_data,
                                  columns=[
                                      "kmer", "Observed", "Expected",
                                      "Z_score", "Evalues", "Odds", "Diff",
                                      "Scores", "NScores", "Log_odds"
                                  ])
                df.to_csv(f"{dir_out}/{n_name}_{opt.max_k}_all_kmer_stats.csv")
                with open(f"{dir_out}/{n_name}_{opt.max_k}_kmer_counts.csv",
                          'w') as fout:
                    fout.write('Kmer,Counts\n')
                    for kmer, count in kmer_counts.items():
                        fout.write(kmer + "," + str(count) + "\n")
            if opt.pal:
                pal_list = list(
                    get_palindromes(opt.alphabet, opt.min_k, opt.max_k))
                # counts of the kmers/palindromes with min_k-2 <= k <= max_k
                pal_counts = count_kmers(seq, opt.alphabet, opt.min_k - 2,
                                         opt.max_k)
                # both strands are counted: add to each reverse complement
                # the count of its k-mer (snapshot taken before updating)
                rev_strand_cnt = {
                    get_reverse_complement(kmer): cnt
                    for kmer, cnt in pal_counts.items()
                }
                for kmer, cnt in rev_strand_cnt.items():
                    pal_counts[kmer] += cnt
                # NOTE(review): the previous version also computed a doubled
                # length (n = len_seq; n *= 2) that was never used; the
                # z-scores are still computed with len_seq - confirm which
                # length is intended.
                pal_data = _compute_kmer_statistics(pal_list, pal_counts,
                                                    len_seq)
                print_results_stats(n_name, pal_list, len_seq, opt.min_k,
                                    opt.max_k, opt.max_e, pal_data)
                df = pd.DataFrame(pal_data,
                                  columns=[
                                      "pal", "Observed", "Expected",
                                      "Z_score", "Evalues", "Odds", "Diff",
                                      "Scores", "NScores", "Log_odds"
                                  ])
                df.to_csv(f"{dir_out}/{n_name}_{opt.max_k}_all_pal_stats.csv")
                with open(
                        f"{dir_out}/{n_name}_{opt.max_k}_palindrome_counts.csv",
                        'w') as fout:
                    fout.write('Palindrome,Counts\n')
                    for pal, count in pal_counts.items():
                        fout.write(pal + "," + str(count) + "\n")
            if opt.all:
                kmer_counts = count_kmers(seq, opt.alphabet, opt.min_k - 2,
                                          opt.max_k)
                kmer_list = get_all_possible_kmers(opt.alphabet, opt.min_k,
                                                   opt.max_k)
                kmer_data = _compute_kmer_statistics(kmer_list, kmer_counts,
                                                     len_seq)
                get_dataframe_from_kmer_data(dir_out, outfile, opt.max_k,
                                             kmer_data)
                # kmer -> [observed, expected, z-score, e-value, ...]
                data_dict = defaultdict(list)
                for data in kmer_data:
                    kmer = data[0]
                    obs = data[1]
                    exp = data[2]
                    zscr = data[3]
                    evalue = data[4]
                    data_dict[kmer] = data_dict.get(
                        kmer, []) + [obs, exp, zscr, evalue]
                # NOTE(review): the file name does not include n_name, so
                # each sequence overwrites the file written for the
                # previous one - confirm this is intended.
                with open(f'{dir_out}/{outfile}_all_kmers_z_scores.csv',
                          'w') as fout:
                    fout.write('kmer, data\n')
                    for kmer, data in data_dict.items():
                        fout.write(kmer + ',' + str(data) + '\n')
            if opt.slide:
                kmer_slide = get_kmer_count_slide_window(
                    seq, opt.alphabet, opt.window, opt.step, opt.min_k,
                    opt.max_k)
                df = pd.DataFrame.from_dict(kmer_slide).fillna(0.0)
                df.to_csv(f"{dir_out}/{n_name}_slide_window.csv")
        cnt_files += 1
    end = time.process_time()
    total_time = end - start_time
    print(f'The script takes {total_time} to finish!')
    print(f'Where read and manipulated {cnt_files} files')
    print('Done!')
#!/usr/bin/env python
"""Collect the plasmid headers found in the fasta files of a directory.

Usage: count_assemblies_with_plasmids.py <directory name>

Prints the total number of headers containing 'plasmid' and writes one
line per fasta file to 'assemblies_with_plasmids.txt'.
"""
import sys

from fasta_parser import fasta_item_counter, parse_fasta
from system_utils import get_fasta_files

if len(sys.argv) < 2:
    print('USAGE: < count_assemblies_with_plasmids.py > < directory name > ')
    sys.exit(1)

path = sys.argv[1]
# one (set of plasmid headers, running total) tuple per fasta file
assemblies_plasmids = []
cnt = 0
for filename in get_fasta_files(path):
    # parse_fasta yields (header, sequence) records; the membership test
    # must look at the header string, not at the whole record
    headers = [
        header for header, _sequence in parse_fasta(filename)
        if 'plasmid' in header
    ]
    # NOTE(review): cnt is the running number of plasmid records, not the
    # number of assemblies that have at least one plasmid, and files
    # without plasmids are appended too - confirm against the message
    # printed below.
    cnt += len(headers)
    assemblies_plasmids.append((set(headers), cnt))
print(f'The number of assemblies with plasmids are {cnt}')
with open('assemblies_with_plasmids.txt', 'w') as fo:
    for entry in assemblies_plasmids:
        fo.write(f'{entry}\n')
def main():
    """
    Count the bases of every sequence of the input fasta files and save
    the results in <dir_out>/<genus>/<sub_dir>/<sub_sub_dir>.

    Reads dir_in, sub_dir, sub_sub_dir and dir_out from
    parse_arguments(). The genus is taken from the third '/'-separated
    component of each file path. For each sequence the base content in
    sliding windows is computed by base_content_slide_window() and the
    total base counts are written to '<seq_id>_bases.csv'.
    """
    # start counting the running time of the script
    start = time()
    # checking the current directory and printing it
    cwd = os.getcwd()
    print(colored(f'\nThe working directory: {cwd}\n',
                  'green',
                  attrs=['bold']))
    # passing the arguments to the script
    args = parse_arguments()
    # name of the input directory, ex. Data/Genomes_splitted
    dir_in = args.dir_in
    # name of the sub directory to save the final result
    # Chromosomes/Plasmids
    sub_dir = args.sub_dir
    # sub_sub dir name, ex., kmers/palindromes
    sub_sub_dir = args.sub_sub_dir
    # name of the root directory to save the final result
    dir_out = args.dir_out
    # alphabet
    alphabet = iupac_dna
    # get the list of all paths to the files in the input directory
    filenames = get_fasta_files(dir_in)
    # check if the output directory exists, otherwise create it
    if os.path.exists(dir_out):
        print(colored('The directory to save the files already exists!',
                      'red',
                      attrs=['bold']))
    else:
        make_me_a_folder(dir_out)
    # initialize the file counter
    cnt_files = 0
    for filename in filenames:
        print(colored(f"File: {filename}", attrs=['bold']))
        # Data/Genomes_splitted/Genus
        # name of the taxon directory, ie. Acidisarcina
        genus = filename.split('/')[2]
        # read in the sequences and ids
        for seq_id, sequence in parse_fasta(filename):
            # get sequence length
            seq_len = len(sequence)
            print(f'Sequence length {seq_len}.')
            bases = count_all_bases(sequence)
            # Results/Genus/Bases
            path = os.path.join(dir_out, genus, sub_dir, sub_sub_dir)
            # exist_ok avoids the check-then-create race of a separate
            # os.path.exists() test
            os.makedirs(path, exist_ok=True)
            print(f'Saving the results in {path}\n')
            # 5000 and 500 are presumably the window size and the step -
            # TODO confirm against base_content_slide_window
            base_content_slide_window(sequence,
                                      path,
                                      seq_id,
                                      alphabet,
                                      5000,
                                      500,
                                      plot=True)
            with open(f'{path}/{seq_id}_bases.csv', 'w') as fout:
                fout.write('base,count\n')
                for base, cnt in bases.items():
                    fout.write(base + ',' + str(cnt) + '\n')
        cnt_files += 1
    # the final time
    end = time()
    # print some info
    print(colored(f"Total number of files: {cnt_files}\n.", attrs=['bold']))
    print(colored(f'Total time for the script: {round(end - start, 2)}.',
                  'red',
                  attrs=['bold']))
    print(colored('Done!', 'green', attrs=['bold']))
from dollarsign import dollarsign_matches from fasta_parser import parse_fasta from fastq_parser import parse_fastq reads_file = 'data/reads.fastq' refs_file = 'data/ref.fa' # reads (fastq) in outer loop for read in parse_fastq(reads_file): print(read) for reference in parse_fasta()