def main(contigs_file, taxonomy_file, dir_path, kmer_length, contig_length, algorithm):
    """Score every contig against every genome.

    Genomes are split into fixed-length parts and dirichlet parameters are
    fitted over the parts.  When a contig is scored against the genome it was
    sampled from, the part(s) overlapping the contig are left out and the
    parameters are refitted, so the contig never scores against itself.
    Results are written to stdout as a tab-separated table.
    """
    DNA.generate_kmer_hash(kmer_length)
    contigs = read_contigs_file(contigs_file, start_position=True)
    # Divide genomes into groups, one for each genus
    meta_genomes = genome_info_from_parsed_taxonomy_file(taxonomy_file)
    # Fetch sequence for each genome
    genomes = read_FASTA_files_no_groups(meta_genomes, dir_path)
    genome_part_l = 10000  # length of each genome part used for fitting
    for genome in genomes:
        genome.calculate_signature()
        genome.parts = genome.split_seq(genome_part_l)
        for part in genome.parts:
            part.calculate_signature()
        genome.pseudo_par = model.fit_nonzero_parameters(
            genome.parts, algorithm=algorithm)
    scores = []
    for contig in contigs:
        contig.calculate_signature()
        for genome in genomes:
            if contig.id == genome.id:
                # Leave-one-out: refit without the part(s) the contig
                # overlaps.  Floor division (//) so the indices stay ints
                # under Python 3 as well; with Py2 ints it is identical to /.
                s = int(contig.start_position)
                start_part_index = s // genome_part_l
                end_part_index = (s + contig_length) // genome_part_l
                # Excluding parts start..end covers both the single-part and
                # the part-spanning case (when start == end this is simply
                # parts[:i] + parts[i+1:]).
                temp_pseudo_par = model.fit_nonzero_parameters(
                    genome.parts[0:start_part_index] + genome.parts[end_part_index + 1:],
                    algorithm=algorithm)
                p_val = model.log_probability(contig, temp_pseudo_par)
            else:
                p_val = model.log_probability(contig, genome.pseudo_par)
            scores.append(Score(p_val, contig, genome, contig.contig_id))
    sys.stdout.write("p_value\tcontig_family\tcontig_genus\tcontig_species\tcontig_genome\tcompare_family\tcompare_genus\tcompare_species\tcompare_genome\tcontig_id" + os.linesep)
    for score in scores:
        sys.stdout.write(str(score) + '\n')
def main(open_name_file, dir_path, l):
    """Split every genome into parts of length *l* and print them as contigs.

    Uses a 1-mer hash (only needed so DNA objects can be constructed) and a
    shared Uniq_id generator so every printed part gets a unique id.
    """
    DNA.generate_kmer_hash(1)
    groups = read_parsed_taxonomy_file(open_name_file)
    # Read in the FASTA files for each genome
    read_FASTA_files(groups, dir_path)
    # For each genome, emit its parts as contigs on stdout.
    id_generator = Uniq_id(1000)
    for group in groups:
        for genome in group.genomes:
            parts = genome.split_seq(l)
            print_parts(parts, sys.stdout, id_generator, genome)
def main(contigs_file, taxonomy_file, dir_path, kmer_length, dir_structure, taxonomy_info_in_contigs):
    """Score every contig against every genome with a multinomial model.

    When a contig is scored against its own source genome, the contig's
    signature is subtracted from a copy of the genome signature before the
    parameters are fitted, so the contig does not score against itself.
    Results are written to stdout as a tab-separated table; the header
    depends on whether taxonomy info is present in the contigs file.
    """
    DNA.generate_kmer_hash(kmer_length)
    contigs = read_contigs_file(contigs_file, taxonomy_info=taxonomy_info_in_contigs)
    # Divide genomes into groups, one for each genus
    meta_genomes = genome_info_from_parsed_taxonomy_file(taxonomy_file)
    # Fetch sequence for each genome
    genomes = read_FASTA_files_no_groups(meta_genomes, dir_path, dir_structure=dir_structure)
    for genome in genomes:
        genome.calculate_signature()
        genome.pseudo_par = mn.fit_nonzero_parameters([genome])
    scores = []
    for contig in contigs:
        contig.calculate_signature()
        for genome in genomes:
            if contig.id == genome.id:
                # Deep copy so the subtraction does not corrupt the genome's
                # own signature for later contigs.
                temp_genome = deepcopy(genome)
                temp_genome.signature.subtract(contig.signature)
                temp_pseudo_par = mn.fit_nonzero_parameters([temp_genome])
                p_val = mn.log_probability(contig, temp_pseudo_par)
            else:
                p_val = mn.log_probability(contig, genome.pseudo_par)
            scores.append(Score(p_val, contig, genome, contig.contig_id, taxonomy_info=taxonomy_info_in_contigs))
    if taxonomy_info_in_contigs:
        sys.stdout.write(
            "p_value\tcontig_family\tcontig_genus\tcontig_species\tcontig_genome\tcompare_family\tcompare_genus\tcompare_species\tcompare_genome\tcontig_id" + os.linesep
        )
    else:
        sys.stdout.write(
            "p_value\t\tcontig_genome\tcompare_family\tcompare_genus\tcompare_species\tcompare_genome\tcontig_id" + os.linesep
        )
    for score in scores:
        sys.stdout.write(str(score) + "\n")
def main(open_name_file, dir_path, x_set, start_position=False):
    """Generate and print sampled contigs for every genome group."""
    # Presumably generate_kmer_hash raises when the hash already exists —
    # the failure is deliberately ignored.  Narrowed from a bare except so
    # SystemExit/KeyboardInterrupt are not swallowed.
    try:
        DNA.generate_kmer_hash(2)
    except Exception:
        pass
    groups = read_parsed_taxonomy_file(open_name_file)
    # Read in the FASTA files for each genome
    read_FASTA_files(groups, dir_path)
    # For each group, generate a number of contigs and print them.
    id_generator = Uniq_id(1000)
    for group in groups:
        sg = SampleGroup(x_set, group, id_generator)
        sg.generate_group_contigs(start_position=start_position)
        sg.print_group_contigs(sys.stdout, start_position=start_position)
def _get_contigs(arg_file, kmer):
    """Read a FASTA file and return (composition, ids).

    composition is a (n_contigs, kmer_hash_count) array of kmer counts
    (pseudo counts minus one — presumably pseudo counts = raw counts + 1;
    TODO confirm against probin.dna), ids is an array of the sequence ids.
    Exits the process with status -1 on any read/parse error.
    """
    from probin.dna import DNA
    DNA.generate_kmer_hash(kmer)
    try:
        with open(arg_file) as handle:
            seqs = list(SeqIO.parse(handle, "fasta"))
    except IOError as error:
        print >> sys.stderr, "Error reading file %s, message: %s" % (error.filename, error.message)
        sys.exit(-1)
    except Exception as error:
        # Generic exceptions have no .filename/.message attributes — using
        # them here raised AttributeError and masked the real error.
        print >> sys.stderr, "Error reading file %s, message: %s" % (arg_file, str(error))
        sys.exit(-1)
    contigs = [DNA(x.id, x.seq.tostring().upper(), calc_sign=True) for x in seqs]
    composition = np.zeros((len(contigs), DNA.kmer_hash_count))
    ids = []
    for i, contig in enumerate(contigs):
        composition[i] = np.fromiter(contig.pseudo_counts, dtype=np.int) - 1
        ids.append(contig.id)
    # The DNA objects hold the full sequences; drop them before returning.
    del contigs
    return composition, np.array(ids)
def main(contigs_file, taxonomy_file, dir_path, kmer_length, contig_length):
    """Score every contig against every genome using part-fitted parameters.

    Each genome is split into 10 kb parts, parameters are fitted over the
    parts (the full fit output is echoed to stderr), and every contig is
    scored against every genome.  Results go to stdout as a tab-separated
    table.  Unlike the leave-one-out variant, contigs are scored against
    their own source genome with the full parameter fit.
    """
    DNA.generate_kmer_hash(kmer_length)
    contigs = read_contigs_file(contigs_file, start_position=True)
    # Divide genomes into groups, one for each genus
    meta_genomes = genome_info_from_parsed_taxonomy_file(taxonomy_file)
    # Fetch sequence for each genome
    genomes = read_FASTA_files_no_groups(meta_genomes, dir_path)
    genome_part_l = 10000  # length of each genome part used for fitting
    for genome in genomes:
        genome.calculate_signature()
        genome.parts = genome.split_seq(genome_part_l)
        for part in genome.parts:
            part.calculate_signature()
        alpha_fit = model.fit_nonzero_parameters_full_output(genome.parts)
        # Echo the full optimizer output for diagnostics.
        sys.stderr.write(str(alpha_fit) + '\n')
        genome.pseudo_par = alpha_fit[0]
    scores = []
    for contig in contigs:
        contig.calculate_signature()
        # Pre-compute the counts as a (1, kmer_hash_count) uint32 row so the
        # model can skip re-deriving them per genome.
        contig.pseudo_counts_array = np.fromiter(contig.pseudo_counts, np.dtype('u4'), DNA.kmer_hash_count).reshape((1, DNA.kmer_hash_count))
        for genome in genomes:
            p_val = model.log_probability(contig, genome.pseudo_par, pseudo_counts_supplied=True)
            scores.append(Score(p_val, contig, genome, contig.contig_id))
    sys.stdout.write("p_value\tcontig_family\tcontig_genus\tcontig_species\tcontig_genome\tcompare_family\tcompare_genus\tcompare_species\tcompare_genome\tcontig_id" + os.linesep)
    for score in scores:
        sys.stdout.write(str(score) + '\n')
def main(contigs_file, contig_time_series_file, genome_time_series_file, taxonomy_file, dir_path, contig_length, total_read_count, assembly_length, first_data, last_data):
    """Score every contig against every genome using coverage time series.

    Attaches each contig's read-mapping time series, fits per-genome
    parameters from the genome time series, scores all contig/genome pairs
    and writes a tab-separated table to stdout.
    """
    DNA.generate_kmer_hash(2)
    contigs = read_contigs_file(contigs_file, start_position=True)
    time_series = read_time_series(contig_time_series_file)
    if len(contigs) != len(time_series.index):
        raise TypeError("The number of contigs and time series does not match")
    # Attach each contig's slice of the time-series table.
    for c in contigs:
        c.mapping_reads = time_series[time_series.contig_id == c.contig_id]
    # Divide genomes into groups, one for each genus
    meta_genomes = genome_info_from_parsed_taxonomy_file(taxonomy_file)
    # Fetch sequence for each genome
    genomes = read_FASTA_files_no_groups(meta_genomes, dir_path)
    # Fetch time series for each genome
    read_time_series_file_genomes(genomes, genome_time_series_file)
    for g in genomes:
        g.pseudo_par = model.fit_nonzero_parameters([g], total_read_count)
    # Score every contig against every genome.
    scores = [
        Score(
            model.log_probability(c, g.pseudo_par, total_read_count, assembly_length),
            c, g, c.contig_id)
        for c in contigs
        for g in genomes
    ]
    sys.stdout.write("p_value\tcontig_family\tcontig_genus\tcontig_species\tcontig_genome\tcompare_family\tcompare_genus\tcompare_species\tcompare_genome\tcontig_id" + os.linesep)
    for s in scores:
        sys.stdout.write(str(s) + '\n')
def _parse_name_file(open_name_file):
    """Parse the names file into GenomeGroup objects (one per genus)."""
    groups = []
    for line in open_name_file:
        if line.startswith('family_name:'):
            family = line.split('\t')[1].strip()
        elif line.startswith('genus_name:'):
            genus = line.split('\t')[1].strip()
            new_group = GenomeGroup(genus)
            new_group.family = family
            groups.append(new_group)
        elif line.startswith('entry:'):
            genome_name = line.split('\t')[2].strip()
            genome_species = line.split('\t')[1].strip()
            meta_genome = {'id': genome_name,
                           'species': genome_species,
                           'genus': genus,
                           'family': family,
                           'file_name': genome_name}
            groups[-1].genome_data.append(meta_genome)
    return groups


def _load_group_genomes(groups):
    """Read each group's FASTA files (relative to cwd) into DNA genomes.

    Skips plasmids and second chromosomes (identified from the header line);
    when a file holds several sequences only the first is used.
    """
    for group in groups:
        for genome_data in group.genome_data:
            dir_name = genome_data['file_name']
            for fasta_file in os.listdir(dir_name):
                # with-statement guarantees the handle is closed on every
                # path (the original leaked it for skipped plasmid files).
                with open(dir_name + '/' + fasta_file) as genome_file:
                    identifier = genome_file.readline()
                    # Only use non-plasmid genomes
                    # Some bacterial genomes contain more than 1 chromosome,
                    # but assumed not more than 2
                    if identifier.find('plasmid') != -1 or identifier.find('chromosome 2') != -1:
                        continue
                    # Rewind instead of close-and-reopen before full parse.
                    genome_file.seek(0)
                    genome_seq = list(SeqIO.parse(genome_file, "fasta"))
                    if len(genome_seq) > 1:
                        sys.stderr.write("Warning! The file " + fasta_file + " in directory " + dir_name + " contained more than one sequence, ignoring all but the first!" + os.linesep)
                    genome = DNA(id=dir_name, seq=str(genome_seq[0].seq))
                    genome.calculate_signature()
                    genome.genus = genome_data['genus']
                    genome.species = genome_data['species']
                    genome.family = genome_data['family']
                    group.genomes.append(genome)


def main(open_name_file, dir_path, kmer_length, x_set):
    """Run the leave-one-group-out contig scoring experiment.

    Parses the names file into genus groups, loads each group's genomes,
    generates contigs per bin and scores them against all bins via
    Experiment, keeping within-group and outside-group scores separate.
    Results are written to stdout as a tab-separated table.
    """
    DNA.generate_kmer_hash(kmer_length)
    # Read the file with all names, divide them into groups
    groups = _parse_name_file(open_name_file)
    # Each genome in a group is a bin, fit parameters to all bins
    os.chdir(dir_path)
    _load_group_genomes(groups)
    # For each bin, generate a number of contigs,
    # re-calculate parameters for that bin without contig-section.
    # Further score this contig against all bins, keep within-group
    # scores separate from outside-group scores.
    all_scores = []
    id_generator = Uniq_id(1000)
    for group_index, group in enumerate(groups):
        rest_groups = all_but_index(groups, group_index)
        test = Experiment(x_set, group, rest_groups, id_generator)
        group_scores = test.execute()
        all_scores.append(group_scores)
    sys.stdout.write("p_value\tcontig_family\tcontig_genus\tcontig_species\tcontig_genome\tcompare_family\tcompare_genus\tcompare_species\tcompare_genome\tcontig_id" + os.linesep)
    for group_scores in all_scores:
        for genome_scores in group_scores:
            for score in genome_scores:
                sys.stdout.write(str(score) + '\n')