def main(contigs_file,taxonomy_file, dir_path, kmer_length, contig_length, algorithm):
    """Score every contig against every genome and print the scores.

    Each genome is split into parts of 10000 positions and parameters are
    fitted to those parts. When a contig carries the same id as the genome
    it is scored against, the parts overlapping the contig are left out and
    the parameters are refitted on the remaining parts before scoring.

    :contigs_file - contigs input, handed to read_contigs_file
    :taxonomy_file - handed to genome_info_from_parsed_taxonomy_file
    :dir_path - location of the genome FASTA files
    :kmer_length - handed to DNA.generate_kmer_hash
    :contig_length - length used to find the last part a contig touches
    :algorithm - forwarded to model.fit_nonzero_parameters

    Writes a tab separated header and one line per score to stdout.
    """
    DNA.generate_kmer_hash(kmer_length)

    contigs = read_contigs_file(contigs_file,start_position=True)

    # Meta data (one entry per genome) parsed from the taxonomy file
    meta_genomes = genome_info_from_parsed_taxonomy_file(taxonomy_file)

    # Fetch sequence for each genome
    genomes = read_FASTA_files_no_groups(meta_genomes, dir_path)

    genome_part_l = 10000
    for genome in genomes:
        genome.calculate_signature()
        genome.parts = genome.split_seq(genome_part_l)
        for part in genome.parts:
            part.calculate_signature()
        genome.pseudo_par = model.fit_nonzero_parameters(
            genome.parts, algorithm=algorithm)

    scores = []
    for contig in contigs:
        contig.calculate_signature()
        for genome in genomes:
            if contig.id == genome.id:
                start = int(contig.start_position)
                # Floor division keeps the indices integral on Python 3 as
                # well; plain '/' would give floats that cannot be used
                # for slicing.
                first_part = start // genome_part_l
                # Computed from the exclusive end offset, so a contig that
                # ends exactly on a part boundary also drops the next part.
                last_part = (start + contig_length) // genome_part_l
                # Covers both the single-part and the multi-part case:
                # everything from first_part through last_part is left out.
                remaining_parts = \
                    genome.parts[:first_part] + genome.parts[last_part + 1:]
                pseudo_par = model.fit_nonzero_parameters(
                    remaining_parts, algorithm=algorithm)
            else:
                pseudo_par = genome.pseudo_par
            p_val = model.log_probability(contig, pseudo_par)
            scores.append(
                Score(p_val, contig, genome, contig.contig_id))

    sys.stdout.write("p_value\tcontig_family\tcontig_genus\tcontig_species\tcontig_genome\tcompare_family\tcompare_genus\tcompare_species\tcompare_genome\tcontig_id" + os.linesep)
    for score in scores:
        sys.stdout.write(str(score) + '\n')
def main(open_name_file, dir_path,l):
    """Split every genome into parts and print the parts to stdout.

    :open_name_file - handed to read_parsed_taxonomy_file, which returns
                      the genome groups
    :dir_path - handed to read_FASTA_files together with the groups
    :l - part length handed to each genome's split_seq
    """
    DNA.generate_kmer_hash(1)

    genome_groups = read_parsed_taxonomy_file(open_name_file)

    # The return value is not used; presumably the sequences are attached
    # to the genomes inside the groups - confirm against read_FASTA_files.
    read_FASTA_files(genome_groups, dir_path)

    uniq_ids = Uniq_id(1000)
    for genome_group in genome_groups:
        for genome in genome_group.genomes:
            genome_parts = genome.split_seq(l)
            print_parts(genome_parts, sys.stdout, uniq_ids, genome)
def main(contigs_file, taxonomy_file, dir_path, kmer_length, dir_structure, taxonomy_info_in_contigs):
    """Score all contigs against all genomes and print a score table.

    Parameters are fitted once per genome. For a contig whose id equals
    the genome id, a copy of the genome has the contig signature
    subtracted from its own signature and the parameters are refitted on
    that copy before the contig is scored.

    :contigs_file - handed to read_contigs_file
    :taxonomy_file - handed to genome_info_from_parsed_taxonomy_file
    :dir_path, dir_structure - handed to read_FASTA_files_no_groups
    :kmer_length - handed to DNA.generate_kmer_hash
    :taxonomy_info_in_contigs - forwarded to read_contigs_file and Score,
                                and selects which header line is printed
    """
    DNA.generate_kmer_hash(kmer_length)

    contigs = read_contigs_file(contigs_file, taxonomy_info=taxonomy_info_in_contigs)
    genome_infos = genome_info_from_parsed_taxonomy_file(taxonomy_file)
    genomes = read_FASTA_files_no_groups(genome_infos, dir_path, dir_structure=dir_structure)

    for genome in genomes:
        genome.calculate_signature()
        genome.pseudo_par = mn.fit_nonzero_parameters([genome])

    results = []
    for contig in contigs:
        contig.calculate_signature()
        for genome in genomes:
            if contig.id != genome.id:
                log_p = mn.log_probability(contig, genome.pseudo_par)
            else:
                # Work on a copy so the stored genome signature is untouched
                reduced_genome = deepcopy(genome)
                reduced_genome.signature.subtract(contig.signature)
                reduced_par = mn.fit_nonzero_parameters([reduced_genome])
                log_p = mn.log_probability(contig, reduced_par)
            results.append(Score(log_p, contig, genome, contig.contig_id, taxonomy_info=taxonomy_info_in_contigs))

    if taxonomy_info_in_contigs:
        header = "p_value\tcontig_family\tcontig_genus\tcontig_species\tcontig_genome\tcompare_family\tcompare_genus\tcompare_species\tcompare_genome\tcontig_id"
    else:
        header = "p_value\t\tcontig_genome\tcompare_family\tcompare_genus\tcompare_species\tcompare_genome\tcontig_id"
    sys.stdout.write(header + os.linesep)

    for result in results:
        sys.stdout.write(str(result) + "\n")
def sample_contig(genome, x_st, contig_id, start_position=False):
    """ Cut one contig of random length out of a genome

    :genome - DNA object, its full_seq is the sequence sampled from
    :x_st - SampleSetting object, supplies contig_min_length,
            contig_max_length and debug_mode
    :contig_id - The unique id given to this contig
    :start_position - when true, the start offset is stored on the
                      returned contig as start_position

    Returns a new DNA object whose id is the genome id followed by
    ' contig'."""
    length = randint(x_st.contig_min_length, x_st.contig_max_length)
    genome_length = len(genome.full_seq)

    # In debug mode the contig always begins at the first position
    offset = 0 if x_st.debug_mode else randint(0, genome_length - length)

    piece = DNA(id=genome.id + " contig",
                seq=genome.full_seq[offset:offset + length])
    piece.contig_id = contig_id
    if start_position:
        piece.start_position = offset
    return piece
def main(open_name_file, dir_path, x_set, start_position=False):
    """Generate contigs for every genome group and print them to stdout.

    :open_name_file - handed to read_parsed_taxonomy_file, which returns
                      the genome groups
    :dir_path - handed to read_FASTA_files together with the groups
    :x_set - settings object handed to SampleGroup
    :start_position - forwarded to generate_group_contigs and
                      print_group_contigs
    """
    try:
        DNA.generate_kmer_hash(2)
    except Exception:
        # Deliberately best effort, as in the original code. Only ordinary
        # errors are ignored; KeyboardInterrupt and SystemExit propagate.
        # NOTE(review): the reason this call may fail is not visible here.
        pass

    groups = read_parsed_taxonomy_file(open_name_file)

    # Read in the FASTA files for each genome
    read_FASTA_files(groups,dir_path)

    # For each group, generate a number of contigs and print them
    id_generator = Uniq_id(1000)
    for group in groups:
        sg = SampleGroup(x_set, group, id_generator)
        sg.generate_group_contigs(start_position=start_position)
        sg.print_group_contigs(sys.stdout,start_position=start_position)
# Example #6
# 0
def _get_contigs(arg_file,kmer):
    """Read a FASTA file and return a count matrix and the sequence ids.

    :arg_file - path of the FASTA file to read
    :kmer - handed to DNA.generate_kmer_hash

    Returns a tuple (composition, ids). composition has one row per
    sequence and DNA.kmer_hash_count columns; each row is the contig's
    pseudo_counts with 1 subtracted from every entry. ids is a numpy
    array of the sequence ids in the same order.

    If the file cannot be read or parsed, a message is written to stderr
    and the process exits with status -1.
    """
    from probin.dna import DNA
    DNA.generate_kmer_hash(kmer)
    try:
        with open(arg_file) as handle:
            seqs = list(SeqIO.parse(handle,"fasta"))
    except Exception as error:
        # Report with arg_file and str(error): not every exception has the
        # 'filename' or 'message' attributes, so reading them here could
        # itself raise and hide the real problem.
        sys.stderr.write("Error reading file %s, message: %s\n" % (arg_file, error))
        sys.exit(-1)

    # str(seq) replaces Seq.tostring(), which newer Biopython no longer has
    contigs = [DNA(x.id, str(x.seq).upper(), calc_sign=True) for x in seqs]
    composition = np.zeros((len(contigs),DNA.kmer_hash_count))
    ids = []
    for i,contig in enumerate(contigs):
        # The builtin int is what the removed alias np.int referred to
        composition[i] = np.fromiter(contig.pseudo_counts,dtype=int) - 1
        ids.append(contig.id)
    return composition,np.array(ids)
# Example #7
# 0
def _annotated_genome(genome_id, sequence, genome_data):
    """Build a DNA object and copy genus, species and family onto it."""
    genome = DNA(id = genome_id, seq = sequence)
    genome.genus = genome_data['genus']
    genome.species = genome_data['species']
    genome.family = genome_data['family']
    return genome


def read_FASTA_files_no_groups(meta_genomes, dir_path,dir_structure='tree'):
    """Read the sequence of every genome listed in meta_genomes.

    :meta_genomes - iterable of dicts with the keys 'file_name', 'genus',
                    'species' and 'family'
    :dir_path - with dir_structure 'tree': a directory holding one sub
                directory per genome, named by 'file_name'. With
                'single_fasta_file': the path of one FASTA file whose
                record ids are the 'file_name' values.
    :dir_structure - 'tree' (default) or 'single_fasta_file'

    Returns a list of DNA objects with genus, species and family set.

    In 'tree' mode a file is used only if its first line does not contain
    'plasmid' and contains 'complete genome' or 'chromosome 1'; only the
    first sequence of such a file is used.

    The working directory is changed while reading and is always restored
    before returning, also when an error occurs.

    Raises ValueError if dir_structure is 'single_fasta_file' but dir_path
    is not an existing file.
    """
    seq_file = None
    if os.path.isfile(dir_path):
        # dirname is '' for a bare file name, and os.chdir('') fails
        work_dir = os.path.dirname(dir_path) or os.curdir
        seq_file = os.path.basename(dir_path)
    else:
        work_dir = dir_path
    if dir_structure == 'single_fasta_file' and seq_file is None:
        raise ValueError(
            "dir_structure 'single_fasta_file' requires dir_path to be "
            "an existing file: %r" % (dir_path,))

    cur_dir = os.getcwd()
    os.chdir(work_dir)
    try:
        seq_dic = {}
        if dir_structure == 'single_fasta_file':
            for seq in SeqIO.parse(seq_file,"fasta"):
                seq_dic[seq.id] = seq

        genomes = []
        for genome_data in meta_genomes:
            dir_name = genome_data['file_name']
            if dir_structure == 'tree':
                for fasta_file in os.listdir(dir_name):
                    with open(dir_name + '/' + fasta_file) as genome_file:
                        identifier = genome_file.readline()
                        # Only use non-plasmid genomes
                        # Some bacterial genomes contain more than 1
                        # chromosome, but assumed not more than 2
                        if 'plasmid' in identifier:
                            continue
                        if 'complete genome' not in identifier and \
                                'chromosome 1' not in identifier:
                            continue
                        # Rewind so the parser sees the header line again
                        genome_file.seek(0)
                        genome_seq = list(SeqIO.parse(genome_file, "fasta"))
                    if len(genome_seq) > 1:
                        sys.stderr.write("Warning! The file " + fasta_file + " in directory " + dir_name + " contained more than one sequence, ignoring all but the first!" + os.linesep)
                    genomes.append(_annotated_genome(
                        dir_name, str(genome_seq[0].seq), genome_data))
            elif dir_structure == 'single_fasta_file':
                seq = seq_dic[dir_name]
                genomes.append(_annotated_genome(
                    seq.id, str(seq.seq), genome_data))
    finally:
        # Never leave the process in a different working directory
        os.chdir(cur_dir)
    return genomes
def main(contigs_file,taxonomy_file, dir_path, kmer_length, contig_length):
    """Score all contigs against all genomes and print a score table.

    Every genome is split into parts of 10000 positions; the full output
    of the parameter fit on those parts is written to stderr and its first
    element is stored as the genome's pseudo_par. Each contig gets a
    1 x DNA.kmer_hash_count array of its pseudo counts before scoring.

    :contigs_file - handed to read_contigs_file
    :taxonomy_file - handed to genome_info_from_parsed_taxonomy_file
    :dir_path - handed to read_FASTA_files_no_groups
    :kmer_length - handed to DNA.generate_kmer_hash
    :contig_length - not used by this function
    """
    DNA.generate_kmer_hash(kmer_length)

    contigs = read_contigs_file(contigs_file, start_position=True)
    genome_infos = genome_info_from_parsed_taxonomy_file(taxonomy_file)
    genomes = read_FASTA_files_no_groups(genome_infos, dir_path)

    part_length = 10000
    for genome in genomes:
        genome.calculate_signature()
        genome.parts = genome.split_seq(part_length)
        for part in genome.parts:
            part.calculate_signature()
        fit_output = model.fit_nonzero_parameters_full_output(genome.parts)
        sys.stderr.write(str(fit_output) + '\n')
        genome.pseudo_par = fit_output[0]

    results = []
    for contig in contigs:
        contig.calculate_signature()
        flat_counts = np.fromiter(
            contig.pseudo_counts, np.dtype('u4'), DNA.kmer_hash_count)
        contig.pseudo_counts_array = flat_counts.reshape(
            (1, DNA.kmer_hash_count))
        for genome in genomes:
            log_p = model.log_probability(
                contig, genome.pseudo_par, pseudo_counts_supplied=True)
            results.append(Score(log_p, contig, genome, contig.contig_id))

    header = "p_value\tcontig_family\tcontig_genus\tcontig_species\tcontig_genome\tcompare_family\tcompare_genus\tcompare_species\tcompare_genome\tcontig_id"
    sys.stdout.write(header + os.linesep)
    for result in results:
        sys.stdout.write(str(result) + '\n')
def main(contigs_file,contig_time_series_file, genome_time_series_file, taxonomy_file,dir_path, contig_length, total_read_count,assembly_length,first_data,last_data):
    """Score contigs against genomes using time series data.

    Each contig gets the rows of the contig time series whose contig_id
    matches its own. Parameters are fitted per genome and every contig is
    scored against every genome; the scores are printed to stdout.

    :contigs_file - handed to read_contigs_file
    :contig_time_series_file - handed to read_time_series
    :genome_time_series_file - handed to read_time_series_file_genomes
    :taxonomy_file - handed to genome_info_from_parsed_taxonomy_file
    :dir_path - handed to read_FASTA_files_no_groups
    :total_read_count - forwarded to the fit and to log_probability
    :assembly_length - forwarded to log_probability
    :contig_length, first_data, last_data - not used by this function

    Raises TypeError if the number of contigs differs from the number of
    rows in the contig time series.
    """
    DNA.generate_kmer_hash(2)

    contigs = read_contigs_file(contigs_file, start_position=True)
    contig_series = read_time_series(contig_time_series_file)

    if len(contig_series.index) != len(contigs):
        raise TypeError("The number of contigs and time series does not match")

    for contig in contigs:
        own_rows = contig_series.contig_id == contig.contig_id
        contig.mapping_reads = contig_series[own_rows]

    genome_infos = genome_info_from_parsed_taxonomy_file(taxonomy_file)
    genomes = read_FASTA_files_no_groups(genome_infos, dir_path)

    # Return value unused; presumably attaches the series to the genomes
    read_time_series_file_genomes(genomes, genome_time_series_file)

    for genome in genomes:
        genome.pseudo_par = model.fit_nonzero_parameters([genome], total_read_count)

    results = []
    for contig in contigs:
        for genome in genomes:
            log_p = model.log_probability(
                contig, genome.pseudo_par, total_read_count, assembly_length)
            results.append(Score(log_p, contig, genome, contig.contig_id))

    header = "p_value\tcontig_family\tcontig_genus\tcontig_species\tcontig_genome\tcompare_family\tcompare_genus\tcompare_species\tcompare_genome\tcontig_id"
    sys.stdout.write(header + os.linesep)
    for result in results:
        sys.stdout.write(str(result) + '\n')
# Example #10
# 0
def read_contigs_file(open_contigs_file, start_position=False,taxonomy_info=True):
    """ Turn the FASTA records of a contigs file into DNA objects

    :open_contigs_file - FASTA input handed to SeqIO.parse
    :start_position - when true (and taxonomy_info is true), the parsed
                      start position is stored on each contig
    :taxonomy_info - when true, the record description is parsed with
                     parse_contig_description and genome, family, genus,
                     species and contig_id are taken from it; otherwise
                     the record id serves as both id and contig_id

    Returns the list of contigs in file order."""
    records = list(SeqIO.parse(open_contigs_file, "fasta"))

    result = []
    for record in records:
        if not taxonomy_info:
            contig = DNA(id=record.id, seq=str(record.seq))
            contig.contig_id = record.id
            result.append(contig)
            continue

        fields = parse_contig_description(record.description, start_position=start_position)
        contig = DNA(id=fields["genome"], seq=str(record.seq))
        if start_position:
            contig.start_position = fields["start_position"]
        for attribute in ("family", "genus", "species", "contig_id"):
            setattr(contig, attribute, fields[attribute])
        result.append(contig)

    return result
def main(open_name_file, dir_path, kmer_length, x_set):
    """Run one Experiment per genome group and print all scores.

    :open_name_file - iterable of lines with tab separated fields; lines
                      starting with 'family_name:', 'genus_name:' and
                      'entry:' are used, all others are ignored
    :dir_path - directory holding one sub directory per genome
    :kmer_length - handed to DNA.generate_kmer_hash
    :x_set - settings object handed to Experiment

    Note that the working directory is changed to dir_path and is not
    changed back.
    """
    groups = []
    DNA.generate_kmer_hash(kmer_length)
    # Read the file with all names, divide them into groups.
    # A new group is started by every genus line; entry lines are added
    # to the most recently started group.
    for line in open_name_file:
        if line.startswith('family_name:'):
            family = line.split('\t')[1].strip()
        elif line.startswith('genus_name:'):
            genus = line.split('\t')[1].strip()
            new_group = GenomeGroup(genus)
            new_group.family = family
            groups.append(new_group)
        elif line.startswith('entry:'):
            fields = line.split('\t')
            genome_name = fields[2].strip()
            genome_species = fields[1].strip()
            meta_genome = {'id': genome_name,
                           'species': genome_species,
                           'genus': genus,
                           'family': family,
                           'file_name': genome_name
                          }
            groups[-1].genome_data.append(meta_genome)

    # Each genome in a group is a bin, read the sequence for all bins
    os.chdir(dir_path)
    for group in groups:
        for genome_data in group.genome_data:
            dir_name = genome_data['file_name']
            for fasta_file in os.listdir(dir_name):
                # 'with' closes the handle even if parsing raises
                with open(dir_name + '/' + fasta_file) as genome_file:
                    identifier = genome_file.readline()
                    # Only use non-plasmid genomes
                    # Some bacterial genomes contain more than 1 chromosome,
                    # but assumed not more than 2
                    if 'plasmid' in identifier or 'chromosome 2' in identifier:
                        continue
                    # Rewind so the parser sees the header line again
                    genome_file.seek(0)
                    genome_seq = list(SeqIO.parse(genome_file, "fasta"))
                if len(genome_seq) > 1:
                    sys.stderr.write("Warning! The file " + fasta_file + " in directory " + dir_name + " contained more than one sequence, ignoring all but the first!" + os.linesep)
                genome = DNA(id = dir_name, seq= str(genome_seq[0].seq))
                genome.calculate_signature()
                genome.genus = genome_data['genus']
                genome.species = genome_data['species']
                genome.family = genome_data['family']
                group.genomes.append(genome)

    # Run one experiment per group, giving it the group itself and all
    # the other groups, and collect the scores it returns.
    all_scores = []
    id_generator = Uniq_id(1000)
    for group_index, group in enumerate(groups):
        rest_groups = all_but_index(groups, group_index)
        test = Experiment(x_set, group, rest_groups, id_generator)
        all_scores.append(test.execute())

    sys.stdout.write("p_value\tcontig_family\tcontig_genus\tcontig_species\tcontig_genome\tcompare_family\tcompare_genus\tcompare_species\tcompare_genome\tcontig_id" + os.linesep)
    for group_scores in all_scores:
        for genome_scores in group_scores:
            for score in genome_scores:
                sys.stdout.write(str(score) + '\n')