def __init__(self, maf_file, blat_alignment_file, corrected_error_rate,
             uncorrected_error_rate):
    """Record the input files and error rates, and load the MAF alignment.

    Args:
        maf_file: Stored as ``self.maf_file`` and handed to
            ``AssessUtil.read_maf_align``; the result is kept in
            ``self.maf_dictionary``.
        blat_alignment_file: Stored as ``self.blat_align``.
        corrected_error_rate: Stored unchanged on the instance.
        uncorrected_error_rate: Stored unchanged on the instance.
    """
    # Plain inputs first ...
    self.maf_file = maf_file
    self.blat_align = blat_alignment_file
    self.corrected_error_rate = corrected_error_rate
    self.uncorrected_error_rate = uncorrected_error_rate
    # ... then the value derived from them.
    self.maf_dictionary = AssessUtil.read_maf_align(self.maf_file)
def get_corrected_read_coordinate(self):
    """Collect, per original read, the intervals covered by corrected reads.

    Reads the whitespace-separated alignment file at ``self.blat_result``
    (presumably BLAT PSL output -- confirm). A row is used only when the
    query name (zero-based field 9), cut at its first ``.``, equals the
    target name (field 13). The target start and end (fields 15 and 16)
    are shifted by the first element of
    ``self.original_read_coordinate[target]``.

    Returns:
        Whatever ``AssessUtil.sort_dictionary_values`` returns for a dict
        mapping each target name to a list of ``[begin, end]`` pairs.
    """
    intervals_by_read = {}
    with open(self.blat_result, 'r') as alignment_rows:
        for row in alignment_rows:
            fields = row.split()
            query_name, target_name = fields[9], fields[13]
            start = int(fields[15])
            stop = int(fields[16])
            # Only rows where the corrected read maps onto its own
            # original read are of interest.
            if query_name.split('.', 1)[0] != target_name:
                continue
            # convert to original chromosome coordinate
            offset = self.original_read_coordinate[target_name][0]
            intervals_by_read.setdefault(target_name, []).append(
                [start + offset, stop + offset])
    return AssessUtil.sort_dictionary_values(
        unsorted_dictionary=intervals_by_read)
def __init__(self, maf_file, working_dir, corrected_file=None):
    """Prepare the working directory and load the original reads.

    Args:
        maf_file: Stored as ``self.maf`` and passed to
            ``AssessUtil.extract_original_read_maf``.
        working_dir: Directory for intermediate files; created when it
            does not exist.
        corrected_file: Optional; stored as ``self.corrected_file``, with
            any falsy value replaced by ``""``.

    After construction ``self.original_coordinate`` holds the unpickled
    object written by ``extract_original_read_maf`` and
    ``self.original_sequence`` is a ``Fasta`` over
    ``<working_dir>/original_seq.fa``.
    """
    self.maf = maf_file
    self.corrected_file = corrected_file or ""
    self.work_dir = working_dir
    if not os.path.exists(self.work_dir):
        os.makedirs(self.work_dir)
    self.original_seq_file = self.work_dir + "/original_seq.fa"
    # this returns file name for a dictionary contains read and coordinate
    coordinate_pickle = AssessUtil.extract_original_read_maf(
        maf_file=self.maf, maf_fasta=self.original_seq_file)
    # NOTE(review): pickle.load can execute arbitrary code. This is only
    # acceptable because the file is presumably the one AssessUtil has
    # just produced -- confirm it never comes from an untrusted source.
    # The with-block closes the handle, which the bare open() did not.
    with open(coordinate_pickle, "rb") as pickled_coordinates:
        self.original_coordinate = pickle.load(pickled_coordinates)
    self.original_sequence = Fasta(self.original_seq_file)
def get_corrected_read_coordinate(self):
    """Collect, per original read, the intervals covered by corrected reads.

    Reads the whitespace-separated alignment file at ``self.output``
    (presumably BLAT PSL output -- confirm). A row is used only when the
    query name (zero-based field 9), cut at its first ``.``, equals the
    target name (field 13). The target start and end (fields 15 and 16)
    are shifted by the first element of
    ``self.original_coordinate[target]``.

    Returns:
        Whatever ``AssessUtil.sort_dictionary_values`` returns for a dict
        mapping each target name to a list of ``[begin, end]`` pairs.

    Raises:
        IndexError: If a row has fewer than 17 fields.
        ValueError: If a start or end field is not an integer.
    """
    corrected_read_dictionary = {}
    with open(self.output, 'r') as input_data:
        for line in input_data:
            line_split = line.split()
            cor_seq_name = line_split[9]
            original_seq_name = line_split[13]
            # Parse as int: the fields come out of split() as strings and
            # the offset below has to be added arithmetically.
            begin = int(line_split[15])
            end = int(line_split[16])
            if cor_seq_name.split('.', 1)[0] == original_seq_name:
                offset = self.original_coordinate[original_seq_name][0]
                corrected_read_dictionary.setdefault(
                    original_seq_name, []).append(
                        [begin + offset, end + offset])
    return AssessUtil.sort_dictionary_values(
        unsorted_dictionary=corrected_read_dictionary)
def get_correction_coordinate(self):
    """Collect corrected intervals and the span of each original read.

    Reads the whitespace-separated alignment file at ``self.blat_align``
    (presumably BLAT PSL output -- confirm). A row is used only when the
    query name (zero-based field 9), cut at its first ``.``, equals the
    target name (field 13).

    Returns:
        A 2-tuple:
        * the result of ``AssessUtil.sort_dictionary_values`` for a dict
          mapping each target name to a list of ``[begin, end]`` pairs
          taken from fields 15 and 16;
        * a dict mapping each target name to ``[0, int(field 14)]``.
    """
    intervals_by_read = {}
    read_spans = {}
    with open(self.blat_align, 'r') as alignment_rows:
        for row in alignment_rows:
            fields = row.split()
            query_name = fields[9]
            target_name = fields[13]
            # Skip rows where the corrected read did not map onto its own
            # original read.
            if query_name.split('.', 1)[0] != target_name:
                continue
            interval = [int(fields[15]), int(fields[16])]
            intervals_by_read.setdefault(target_name, []).append(interval)
            read_spans[target_name] = [0, int(fields[14])]
    sorted_intervals = AssessUtil.sort_dictionary_values(
        unsorted_dictionary=intervals_by_read)
    return sorted_intervals, read_spans
def get_uncorrected_coordinate(cls, corrected_coordinate_dict,
                               original_coordinate_dict):
    """Delegate to ``AssessUtil.uncorrected_interval``.

    Args:
        corrected_coordinate_dict: Forwarded as ``corrected_coordinate``.
        original_coordinate_dict: Forwarded as ``original_seq_length``.

    Returns:
        The value returned by ``AssessUtil.uncorrected_interval``.
    """
    uncorrected = AssessUtil.uncorrected_interval(
        corrected_coordinate=corrected_coordinate_dict,
        original_seq_length=original_coordinate_dict)
    return uncorrected
"--fullLength", help="Proovread untrimmed unsplitted file", type=str, required=True) parser.add_argument("-o", "--outputFile", help="A file that contains uncorrected PacBio reads", type=str, required=True) parser.add_argument( "-m", "--minimumLength", help="The minimum length of uncorrected PacBio reads default 50", type=str, required=False) args = parser.parse_args() corrected = args.correctedReads full_reads = args.fullLength output = args.outputFile minimum = args.minimumLength if not minimum: minimum = 50 if __name__ == '__main__': AssessUtil.extract_uncorrected_reads_proovread( trimmed_corrected_reads=corrected, full_corrected_reads=full_reads, output_file=output, threshold=minimum)
def get_maf_sign_dictionary(self):
    """Return ``read_maf_sign(self.maf_file)`` from a new ``AssessUtil``."""
    return AssessUtil().read_maf_sign(self.maf_file)
from AssessUtil import AssessUtil
import argparse
import time
from pyfaidx import Fasta

# Command line: an input FASTA of reads and the path of the table to write.
parser = argparse.ArgumentParser(
    description='Script to calculate complexity and GC content of read')
parser.add_argument('-v',
                    '--version',
                    action='version',
                    version='%(prog)s 0.01')
parser.add_argument("-i",
                    "--input",
                    help="Fatsa reads file",
                    type=str,
                    required=True)
parser.add_argument("-o",
                    "--outputFile",
                    help="output reads file",
                    type=str,
                    required=True)
args = parser.parse_args()

input_file = args.input
output_file = args.outputFile

if __name__ == '__main__':
    row_template = "{}\t{}\t{}\n"
    with Fasta(input_file) as data_in, open(output_file, 'w') as data_out:
        # Header row, then one tab-separated row per record.
        data_out.write("read_name\tGC\tcomplexity\n")
        for record in data_in:
            read_name = record.name
            bases = str(record)
            gc_value = AssessUtil.gc_content(bases)
            complexity_value = AssessUtil.complexity(bases)
            data_out.write(
                row_template.format(read_name, gc_value, complexity_value))
    # Completion stamp: day/month/year followed by a 12-hour clock time.
    finished_date = time.strftime("%d/%m/%Y")
    finished_time = time.strftime("%I:%M:%S")
    print('Done {} {}'.format(finished_date, finished_time))
def get_uncorrected_coordinate(self):
    """Return the uncorrected intervals reported by ``AssessUtil``.

    Calls ``self.get_corrected_read_coordinate()`` and passes its result,
    together with ``self.original_read_coordinate``, to
    ``AssessUtil.uncorrected_interval``.

    Returns:
        The value returned by ``AssessUtil.uncorrected_interval``.
    """
    corrected_intervals = self.get_corrected_read_coordinate()
    return AssessUtil.uncorrected_interval(
        corrected_coordinate=corrected_intervals,
        original_seq_length=self.original_read_coordinate)
def get_uncorrected_coordinate(self, dictionary_of_corrected_reads):
    """Return the uncorrected intervals reported by ``AssessUtil``.

    Args:
        dictionary_of_corrected_reads: Forwarded as ``corrected_coordinate``
            to ``AssessUtil.uncorrected_interval``;
            ``self.original_coordinate`` is forwarded as
            ``original_seq_length``.

    Returns:
        The value returned by ``AssessUtil.uncorrected_interval``.
    """
    return AssessUtil.uncorrected_interval(
        corrected_coordinate=dictionary_of_corrected_reads,
        original_seq_length=self.original_coordinate)
# argparse is used below; importing it here keeps the script runnable on
# its own (a repeated import is harmless if it is also imported elsewhere).
import argparse

from AssessUtil import AssessUtil

# Command line: the maf alignment to read and the FASTA file to write.
parser = argparse.ArgumentParser(
    description='Script to extract original read from maf file')
parser.add_argument('-v',
                    '--version',
                    action='version',
                    version='%(prog)s 0.01')
parser.add_argument(
    "-m",
    "--mafFile",
    help=
    "A maf file that contains the alignment of original and artificial data",
    type=str,
    required=True)
# The value is passed as maf_fasta below, so the help text describes the
# extracted-reads FASTA rather than an assessment report.
parser.add_argument(
    "-o",
    "--outputFile",
    help="FASTA file to write the original reads extracted from the maf file",
    type=str,
    required=True)
args = parser.parse_args()

maf = args.mafFile
result = args.outputFile

if __name__ == "__main__":
    # The call returns the location of the coordinate dictionary, which is
    # reported to the user.
    original_coordinate = AssessUtil.extract_original_read_maf(
        maf_file=maf, maf_fasta=result)
    print("Dictionary with the coordinate in: {}".format(original_coordinate))