def trim_additional_merged_contigs(original_contig, merged_contig):
    """Trim the start of original_contig based on the merged contig header.

    The merged contig header is expected to end with "..._os<offset>_<cigar>".
    The trim length is that offset plus all M/X (aligned) bases of the cigar.
    When the trim exceeds 50 bases, original_contig is rewritten in place with
    the trimmed sequence, but only if more than 100 bases remain (otherwise
    the file is truncated to empty, as in the original behaviour).
    """
    with open(merged_contig) as open_merged:
        fasta_reader = FastaReader(open_merged)
        header, sequence = fasta_reader.next()
    sp_header = header.split('_')
    # offset field looks like "os123"; strip the letters to get the number
    trim = int(sp_header[-2].strip('os'))
    cigar = sp_header[-1]
    all_cigar = re.findall(r'(\d+)([MXDI])', cigar)
    for count, cigar_op in all_cigar:
        # only match (M) and mismatch (X) operations consume the contig
        if cigar_op == "M" or cigar_op == "X":
            trim += int(count)

    if trim > 50:
        logging.info("trim %s of %s" % (trim, original_contig))
        with open(original_contig) as open_contig:
            fasta_reader = FastaReader(open_contig)
            # BUG FIX: the original wrote `header, sequence,fasta_reader.next()`
            # (comma instead of '='), so the record was never read and the
            # stale merged-contig header/sequence were reused instead.
            header, sequence = fasta_reader.next()
        header = header + "trim_%s" % trim
        sequence = sequence[trim:]
        with open(original_contig, 'w') as open_contig:
            if len(sequence) > 100:
                open_contig.write(">%s\n%s\n" % (header, sequence))
Beispiel #2
0
def get_fasta_length(fasta_file):
    """Return the sequence length of the first record in *fasta_file*."""
    with open(fasta_file) as open_fasta:
        header, sequence = FastaReader(open_fasta).next()
    return len(sequence)
def get_fasta_length(fasta_file):
    """Length of the first sequence found in the given fasta file."""
    with open(fasta_file) as open_fasta:
        reader = FastaReader(open_fasta)
        first_header, first_sequence = reader.next()
        result = len(first_sequence)
    return result
Beispiel #4
0
 def __init__(self, genome_file, keep_in_memory=True, keep_until_done=False,
              prefix=''):
     """Open the genome fasta file and initialise the chromosome cache."""
     # caching behaviour flags and the optional chromosome-name prefix
     self.keep_in_memory = keep_in_memory
     self.keep_until_done = keep_until_done
     self.prefix = prefix
     # cache of already-loaded chromosomes: name -> (header, sequence)
     self.all_chr = {}
     self.open_genome_file = open_input_file(genome_file)
     self.reader = FastaReader(self.open_genome_file)
Beispiel #5
0
def correct_contig_file(contig_file, site_name, min_contig_len=101):
    """Filter and rename contigs into contigs_corrected.fa.

    A contig is kept when it is longer than min_contig_len and less than
    half of it is covered by homopolymer runs of 7+ identical bases.
    Kept contigs are renamed "<site_name>_pair_<n>_length_<len>".

    Returns (corrected_file_path, number_of_kept_contigs, longest_length).
    """
    corrected_file = os.path.join(os.path.dirname(contig_file),
                                  'contigs_corrected.fa')
    open_file = open(contig_file)
    open_corrected = open(corrected_file, 'w')
    nb_seq = 0
    max_len = 0
    for header, sequence in FastaReader(open_file):
        # total bases covered by homopolymer runs (length >= 7) of each base
        total_repeat = 0
        for base in 'ACGT':
            runs = re.findall('[%s]{7,}' % base, sequence)
            total_repeat += sum(len(run) for run in runs)
        seq_len = len(sequence)
        if seq_len > min_contig_len and float(total_repeat) / seq_len < .5:
            nb_seq += 1
            if seq_len > max_len:
                max_len = seq_len
            new_header = '%s_pair_%s_length_%s' % (site_name, nb_seq, seq_len)
            open_corrected.write('>%s\n%s\n' % (new_header, sequence))
    open_corrected.close()
    open_file.close()
    return (corrected_file, nb_seq, max_len)
Beispiel #6
0
 def __init__(self, genome_file, keep_in_memory=True, keep_until_done=False, prefix=''):
     """Store the caching options and open a FastaReader on *genome_file*."""
     self.prefix = prefix
     self.keep_until_done = keep_until_done
     self.keep_in_memory = keep_in_memory
     # chromosome cache: name -> (header, sequence)
     self.all_chr = {}
     self.open_genome_file = open_input_file(genome_file)
     self.reader = FastaReader(self.open_genome_file)
Beispiel #7
0
def merge_read1_and_read2_contigs(name, read1_contig, read2_contigs,
                                  output_dir):
    """Merge the read1 contig with the read2 contig(s) of a site.

    With a single read2 contig, the merge result (if any) is written to
    tmp_merged_consensus.fa and that path is returned.  With several read2
    contigs, they are sorted with compare_fasta_length and tried in order;
    the first successful merge plus every unmerged contig are concatenated
    into merged_consensus.fa.  Returns the resulting fasta path, or None
    when no merge was possible.
    """
    # load every read2 contig into memory so we can count and sort them
    # NOTE(review): open_read2 is never closed
    open_read2 = open(read2_contigs)
    all_fasta2_entries = []
    read2_reader = FastaReader(open_read2)
    for header, sequence in read2_reader:
        all_fasta2_entries.append((header, sequence))
    if len(all_fasta2_entries) == 1:
        # single read2 contig: one merge attempt against the whole file
        merged_contigs_info = merge_2_contigs(name, read1_contig,
                                              read2_contigs, output_dir)
        if merged_contigs_info:
            merged_contig_file = os.path.join(output_dir,
                                              'tmp_merged_consensus.fa')
            with open(merged_contig_file, 'w') as open_output:
                # merged_contigs_info is used as a (header, sequence) pair
                open_output.write('>%s\n%s\n' % merged_contigs_info)
            return merged_contig_file
        else:
            return None
    else:
        # several read2 contigs: sort by length (Python 2 cmp-style sort)
        all_fasta2_entries.sort(cmp=compare_fasta_length)
        merged_pair = None
        remaining = []
        for header, sequence in all_fasta2_entries:
            # write each read2 contig to its own temporary fasta file
            cur_pair = os.path.join(output_dir, header + ".fa")
            open_pair = open(cur_pair, 'w')
            open_pair.write(">%s\n%s\n" % (header, sequence))
            open_pair.close()
            if not merged_pair:
                # no merge succeeded yet: try this contig against read1
                merged_pair = merge_2_contigs(name, read1_contig, cur_pair,
                                              output_dir)
                if not merged_pair:
                    remaining.append(cur_pair)
            else:
                # a merge already succeeded; further merges are attempted but
                # their result is deliberately discarded (see TODO below)
                results = merge_2_contigs(name + "add", read1_contig, cur_pair,
                                          output_dir)
                #TODO: fix this as it doesn't seems to trim and the output file is the same as above but in the mean time disable using False
                if False and results:
                    additional_merged_pair = os.path.join(
                        output_dir, 'tmp_merged_consensus.fa')
                    with open(additional_merged_pair, 'w') as open_output:
                        open_output.write('>%s\n%s\n' % results)

                    #trim this contig
                    trim_additional_merged_contigs(cur_pair,
                                                   additional_merged_pair)
                remaining.append(cur_pair)

        merge_file = os.path.join(output_dir, "merged_consensus.fa")
        if merged_pair:
            # write the merged pair first, then append the leftover contigs
            merged_pair_file = os.path.join(output_dir,
                                            'tmp_merged_consensus.fa')
            with open(merged_pair_file, 'w') as open_output:
                open_output.write('>%s\n%s\n' % merged_pair)
            tmp = [merged_pair_file]
            tmp.extend(remaining)
            concatenate_consensus(tmp, merge_file)
            return merge_file
        else:
            return None
Beispiel #8
0
def force_merge_consensus(read1_consensus, read2_consensus, output_merge_file):
    """Forcibly join the read1 consensus with every read2 consensus.

    Sequences are concatenated with 100-N spacers and written as a single
    fasta record named "<read1>_forced_merged_<read2>..." to
    output_merge_file.
    """
    open_output = open(output_merge_file, 'w')
    open_read1 = open(read1_consensus)
    open_read2 = open(read2_consensus)
    read1_name, read1_sequence = FastaReader(open_read1).next()
    open_read1.close()

    merged_name = "%s_forced_merged" % read1_name
    pieces = [read1_sequence]
    for read2_name, read2_sequence in FastaReader(open_read2):
        merged_name = "%s_%s" % (merged_name, read2_name)
        # 100 Ns mark the artificial junction between contigs
        pieces.append("N" * 100)
        pieces.append(read2_sequence)

    open_output.write(">%s\n%s\n" % (merged_name, ''.join(pieces)))
    open_read2.close()
    open_output.close()
def force_merge_consensus(read1_consensus, read2_consensus, output_merge_file):
    """Join read1 and read2 consensus sequences with N spacers into one record."""
    open_output = open(output_merge_file, 'w')
    open_read1 = open(read1_consensus)
    open_read2 = open(read2_consensus)
    name, first_sequence = FastaReader(open_read1).next()
    open_read1.close()
    name += "_forced_merged"
    chunks = [first_sequence]
    for extra_name, extra_sequence in FastaReader(open_read2):
        name += "_%s" % extra_name
        chunks.append("N" * 100)
        chunks.append(extra_sequence)
    open_output.write(">%s\n%s\n" % (name, ''.join(chunks)))
    open_read2.close()
    open_output.close()
Beispiel #10
0
def get_basic_stats(contig_file, all_sites):
    """Update per-site contig statistics from a fasta file.

    For every record named "<site>_pair_<n>_length_<len>", increments the
    site's "number_contig" counter and records the longest sequence length
    under "max_length".  Returns all_sites (also modified in place).
    """
    # context manager fixes the file-handle leak of the original version
    with open(contig_file) as open_file:
        fasta_reader = FastaReader(open_file)
        for header, sequence in fasta_reader:
            match = re.match(r'(.+)_pair_\d+_length_\d+', header)
            site_name = match.group(1)
            all_sites[site_name]["number_contig"] += 1
            # default 0 makes the comparison explicit; the original relied on
            # Python 2's "int > None is always True" to initialise max_length
            if len(sequence) > all_sites[site_name].get("max_length", 0):
                all_sites[site_name]["max_length"] = len(sequence)
    return all_sites
Beispiel #11
0
def get_list_of_length(contig_file):
    """Return (number_of_contigs, longest_contig_length) for a fasta file."""
    nb_contig = 0
    max_length = 0
    # 'with' fixes the file-handle leak; the unused list_length accumulator
    # from the original was dead code and is removed
    with open(contig_file) as open_file:
        for header, sequence in FastaReader(open_file):
            nb_contig += 1
            if len(sequence) > max_length:
                max_length = len(sequence)
    return nb_contig, max_length
def create_sequence_dict_from_contigs_file(contig_file):
    print "Read %s" % contig_file
    sequence_dictionary = []
    all_names = {}
    with open(contig_file) as open_file:
        for header, sequence in FastaReader(open_file):
            sequence_dictionary.append({"SN": header, "LN": len(sequence)})
            if all_names.has_key(header):
                raise StandardError("Duplicated reference name %s in %s" %
                                    (header, contig_file))
            all_names[header] = 1
    return sequence_dictionary
Beispiel #13
0
def trim_additional_merged_contigs(original_contig, merged_contig):
    """Trim the start of original_contig using the merged contig's header.

    The merged header ends with "..._os<offset>_<cigar>"; the trim length is
    the offset plus every M/X base of the cigar.  A trim above 50 rewrites
    original_contig in place, keeping the sequence only when more than 100
    bases survive the trim.
    """
    with open(merged_contig) as open_merged:
        fasta_reader = FastaReader(open_merged)
        header, sequence = fasta_reader.next()
    sp_header = header.split('_')
    # the offset field looks like "os123"
    trim = int(sp_header[-2].strip('os'))
    cigar = sp_header[-1]
    all_cigar = re.findall(r'(\d+)([MXDI])', cigar)
    for count, cigar_op in all_cigar:
        # only aligned bases (M match / X mismatch) extend the trim
        if cigar_op == "M" or cigar_op == "X":
            trim += int(count)

    if trim > 50:
        logging.info("trim %s of %s" % (trim, original_contig))
        with open(original_contig) as open_contig:
            fasta_reader = FastaReader(open_contig)
            # BUG FIX: `header, sequence, fasta_reader.next()` built a tuple
            # instead of assigning, leaving the merged contig's record in
            # header/sequence; now the original contig is actually read.
            header, sequence = fasta_reader.next()
        header = header + "trim_%s" % trim
        sequence = sequence[trim:]
        with open(original_contig, 'w') as open_contig:
            if len(sequence) > 100:
                open_contig.write(">%s\n%s\n" % (header, sequence))
Beispiel #14
0
def analyse_consensus_file(consensus_file):
    """Inspect a consensus fasta file.

    Returns (number_read2_contig, longuest_contigs, is_merged):
    number_read2_contig counts merged or read2 ("pair") records,
    longuest_contigs is the longest such record's length, and is_merged
    tells whether at least one merged record was seen.  Records that match
    neither pattern are read1 contigs and are ignored.
    """
    with open(consensus_file) as open_file:
        fasta_reader = FastaReader(open_file)
        number_read2_contig = 0
        longuest_contigs = 0
        is_merged = False
        for header, sequence in fasta_reader:
            # headers look like "..._merged_os0_71D21M193I" or
            # "..._pair_1_length_452"
            sp_header = header.split('_')
            if len(sp_header) > 2 and sp_header[-3] == 'merged':
                number_read2_contig += 1
                is_merged = True
                if len(sequence) > longuest_contigs:
                    longuest_contigs = len(sequence)
            elif len(sp_header) > 3 and sp_header[-4] == 'pair':
                number_read2_contig += 1
                if len(sequence) > longuest_contigs:
                    longuest_contigs = len(sequence)
            # read1 contigs fall through untouched; the original assigned a
            # never-read `type` variable (shadowing the builtin) in every
            # branch — that dead code is removed
    return number_read2_contig, longuest_contigs, is_merged
Beispiel #15
0
class GenomeLoader:
    """Load chromosomes by name from a fasta genome file.

    With keep_in_memory=True, chromosomes read while scanning forward are
    cached until requested.  With keep_until_done=True, every loaded
    chromosome stays cached until the object is destroyed.  An optional
    prefix is added to chromosome names that do not already carry it.
    """
    def __init__(self,
                 genome_file,
                 keep_in_memory=True,
                 keep_until_done=False,
                 prefix=''):
        self.open_genome_file = open_input_file(genome_file)
        self.reader = FastaReader(self.open_genome_file)
        self.keep_in_memory = keep_in_memory
        self.keep_until_done = keep_until_done
        self.prefix = prefix
        # cache of loaded chromosomes: name -> (header, sequence)
        self.all_chr = {}

    def load_all(self):
        """Read the whole file into the cache and return the chromosome names.

        Searching for a name that can never match forces get_chr to read
        (and cache) every record.
        """
        self.get_chr('***********************************************')
        return self.all_chr.keys()

    def get_chr(self, chr):
        """Return the (header, sequence) pair for *chr*, or None if absent.

        Serves from the cache when possible, otherwise scans forward through
        the file, caching skipped records according to the caching flags.
        """
        if not chr.startswith(self.prefix):
            chr = self.prefix + chr

        if self.keep_until_done:  #return if loaded already
            fasta_seq = self.all_chr.get(chr)
        else:  #remove if loaded already
            fasta_seq = self.all_chr.pop(chr, None)
        if fasta_seq:
            (header, seq) = fasta_seq
            logging.debug('return %s' % header)
            return fasta_seq
        curr_chr = ''
        seq = ''
        while not curr_chr == chr:
            fasta_seq = self.reader.next()
            if fasta_seq:
                (header, seq) = fasta_seq
                logging.debug('load %s' % header)
                # chromosome name is the first word of the fasta header
                curr_chr = header.split()[0]
                if not curr_chr.startswith(self.prefix):
                    curr_chr = self.prefix + curr_chr
                if (self.keep_in_memory
                        and not curr_chr == chr) or self.keep_until_done:
                    logging.debug('keep %s' % header)
                    self.all_chr[curr_chr] = fasta_seq
            else:
                break
        return fasta_seq

    def next(self):
        """Return the next (header, sequence) pair, or None at end of file.

        Cached chromosomes are served (and, unless keep_until_done, evicted)
        before any further record is read from the file.
        """
        fasta_seq = None
        # BUG FIX: the original first executed `chr = self.all_chr.keys[0]`
        # (indexing the bound method, not its result), raising TypeError
        # whenever the cache was non-empty; that broken duplicate is removed.
        if len(self.all_chr) > 0:
            chr = self.all_chr.keys()[0]
            if self.keep_until_done:  #return if loaded already
                fasta_seq = self.all_chr.get(chr)
            else:  #remove if loaded already
                fasta_seq = self.all_chr.pop(chr, None)
            if fasta_seq:
                # BUG FIX: unpack so the debug log below has a header to
                # report (the original raised NameError on the cached path,
                # previously masked by the TypeError above)
                (header, seq) = fasta_seq
        if not fasta_seq:
            fasta_seq = self.reader.next()
            if fasta_seq:
                (header, seq) = fasta_seq
                curr_chr = header.split()[0]
                logging.debug('load %s' % header)
                if self.keep_until_done:
                    logging.debug('keep %s' % header)
                    self.all_chr[curr_chr] = fasta_seq
        if fasta_seq:
            logging.debug('return %s' % header)
            return fasta_seq
        return None

    def __iter__(self):
        # iterate by calling next() until it returns the None sentinel
        return iter(self.next, None)

    def __del__(self):
        self.open_genome_file.close()
        self.all_chr = None

    def close(self):
        self.__del__()
Beispiel #16
0
class GenomeLoader:
    """Load chromosomes on demand from a fasta genome file.

    keep_in_memory=True caches chromosomes skipped while scanning until they
    are requested; keep_until_done=True keeps every loaded chromosome until
    the object is destroyed.  A prefix is prepended to chromosome names that
    lack it.
    """
    def __init__(self, genome_file, keep_in_memory=True, keep_until_done=False, prefix=''):
        self.open_genome_file = open_input_file(genome_file)
        self.reader = FastaReader(self.open_genome_file)
        self.keep_in_memory = keep_in_memory
        self.keep_until_done = keep_until_done
        self.prefix = prefix
        # name -> (header, sequence) cache
        self.all_chr = {}

    def load_all(self):
        """Force-read the whole file into the cache; return chromosome names."""
        # a name made of asterisks can never match, so get_chr scans to EOF
        self.get_chr('***********************************************')
        return self.all_chr.keys()

    def get_chr(self, chr):
        """Return (header, sequence) for *chr*; None when not found."""
        if not chr.startswith(self.prefix):
            chr = self.prefix + chr

        if self.keep_until_done:  #return if loaded already
            fasta_seq = self.all_chr.get(chr)
        else:  #remove if loaded already
            fasta_seq = self.all_chr.pop(chr, None)
        if fasta_seq:
            (header, seq) = fasta_seq
            logging.debug('return %s' % header)
            return fasta_seq
        curr_chr = ''
        seq = ''
        while not curr_chr == chr:
            fasta_seq = self.reader.next()
            if fasta_seq:
                (header, seq) = fasta_seq
                logging.debug('load %s' % header)
                curr_chr = header.split()[0]
                if not curr_chr.startswith(self.prefix):
                    curr_chr = self.prefix + curr_chr
                if (self.keep_in_memory and not curr_chr == chr) or self.keep_until_done:
                    logging.debug('keep %s' % header)
                    self.all_chr[curr_chr] = fasta_seq
            else:
                break
        return fasta_seq

    def next(self):
        """Return the next (header, sequence) pair, or None at end of file."""
        fasta_seq = None
        # BUG FIX: removed the leftover `chr = self.all_chr.keys[0]` line,
        # which indexed the bound method itself and raised TypeError whenever
        # the cache held entries.
        if len(self.all_chr) > 0:
            chr = self.all_chr.keys()[0]
            if self.keep_until_done:  #return if loaded already
                fasta_seq = self.all_chr.get(chr)
            else:  #remove if loaded already
                fasta_seq = self.all_chr.pop(chr, None)
            if fasta_seq:
                # BUG FIX: unpack the cached record so the final debug log
                # does not raise NameError on `header`
                (header, seq) = fasta_seq
        if not fasta_seq:
            fasta_seq = self.reader.next()
            if fasta_seq:
                (header, seq) = fasta_seq
                curr_chr = header.split()[0]
                logging.debug('load %s' % header)
                if self.keep_until_done:
                    logging.debug('keep %s' % header)
                    self.all_chr[curr_chr] = fasta_seq
        if fasta_seq:
            logging.debug('return %s' % header)
            return fasta_seq
        return None

    def __iter__(self):
        return iter(self.next, None)

    def __del__(self):
        self.open_genome_file.close()
        self.all_chr = None

    def close(self):
        self.__del__()