def simple_validate(divided: dict, r_ref_gb: Path, r_ref_fasta: Path, arg):
    """
    Validate records of genomes WITHOUT 4-parts structure.

    Each record that is not marked as skipped is compared with the
    reference by BLAST and drawn with simple_draw(). Detection of reverse
    complement regions is not performed here.

    Args:
        divided(dict): info of divided files, one inner dict per record;
            the 'skip', 'gb' and 'fasta' keys are read
        r_ref_gb(Path): rotated gb of the reference
        r_ref_fasta(Path): rotated fasta of the reference
        arg(NameSpace): arguments; tmp, out, perc_identity, input and ref
            are used
    Return:
        divided(dict): the same dict, 'success' and the length of each
            region are filled in place; None if the BLAST comparison
            gave nothing
    """
    for info in divided.values():
        # pessimistic default, only flipped at the very end
        info['success'] = False
        if info['skip']:
            continue
        gb_file = info['gb']
        fasta_file = info['fasta']
        log.info(f'Analyze {fasta_file}.')
        regions = utils.get_regions(gb_file)
        # add regions info
        for region_name in regions:
            info[region_name] = len(regions[region_name])
        blast_data = compare_seq(fasta_file, r_ref_fasta, arg.tmp,
                                 arg.perc_identity)
        if blast_data is None:
            log.critical('Cannot run BLAST.')
            log.debug(f'{arg.input} {arg.ref} BLAST_FAIL\n')
            return None
        figure = simple_draw(r_ref_gb, gb_file, blast_data)
        utils.move(figure, arg.out/figure.name)
        log.debug('Skip detecting reverse complement region.')
        utils.move(fasta_file, fasta_file.with_suffix('.fasta'))
        info['success'] = True
    return divided
def compare_seq(query, reference, tmp, perc_identity):
    """
    Use BLAST to compare two records.

    Args:
        query(Path or str): query file
        reference(Path or str): reference file
        tmp(Path): temp folder, BLAST result and log are moved into it
        perc_identity(float): percent identity for BLAST, need multiply 100
    Return:
        list of [qstart, qend, sstart, send, sstrand, pident], one item per
        hit of the first query record; None if BLAST returned no result
        file or the result file contains no record
    """
    blast_result, blast_log = utils.blast(Path(query), reference,
                                          perc_identity*100)
    if blast_result is None:
        return None
    per_query = []
    # only one record in file, loop is for unpack
    for hits in utils.parse_blast_tab(blast_result):
        rows = []
        for hit in hits:
            # every hit must have exactly these twelve columns
            (_qseqid, _sseqid, sstrand, _qlen, _slen, _length, pident,
             _gapopen, qstart, qend, sstart, send) = hit
            rows.append([qstart, qend, sstart, send, sstrand, pident])
        per_query.append(rows)
    # result file is fully parsed now, safe to move it away
    utils.move(blast_result, tmp/blast_result.name)
    utils.move(blast_log, tmp/blast_log.name)
    if per_query:
        return per_query[0]
    return None
def divide_records(fasta: Path, output: Path, ref_len: int, tmp: Path,
                   len_diff=0.1, simple_validate=False):
    """
    Make sure each file has only one record.

    Every record is written to its own fasta file in the output folder.
    Records whose length differs too much from the reference are marked
    as 'undersize' or 'oversize'; records that utils.rotate_seq() cannot
    handle are marked as 'structure_unusual'.

    Args:
        fasta(Path): fasta file
        output(Path): output folder
        ref_len(int): length of reference, to filter bad records
        tmp(Path): temp folder
        len_diff: maximum allowed length difference
        simple_validate(bool): skip normal rotate or not
    Returns:
        divided(dict): info of divided files, key is the path of the
            divided file, value is a dict of its properties
    """
    records = list(SeqIO.parse(fasta, 'fasta'))
    n_records = len(records)
    fields = ('gb', 'fasta', 'length', 'LSC', 'IRa', 'SSC', 'IRb',
              'missing', 'incomplete', 'rc', 'figure', 'figure_after',
              'skip')
    divided = {}
    log.info(f'Found {n_records} records in {fasta.name}.')
    if n_records > 1:
        log.info('Divide them into different files.')
    log.info(f"Check record's length (reference: {ref_len} bp, "
             f"difference limit {len_diff:.2%}).")
    for number, record in enumerate(records, start=1):
        # numbered file names are only necessary for multi-record input
        if n_records > 1:
            filename = output / f'{fasta.stem}_{number}{fasta.suffix}'
        else:
            filename = output / fasta.name
        info = dict.fromkeys(fields, '')
        divided[filename] = info
        record_len = len(record)
        record_len_diff = (record_len/ref_len) - 1
        info['fasta'] = filename
        info['length'] = record_len
        info['length_diff'] = record_len_diff
        # False means keep, otherwise a string that tells why it is skipped
        skip = False
        if abs(record_len_diff) > len_diff:
            log.warning(f'Skip NO.{number} record ({record_len} bp, '
                        f'length difference {record_len_diff:.2%}).')
            if record_len_diff < 0:
                skip = 'undersize'
            else:
                skip = 'oversize'
        # skipped records are written, too
        SeqIO.write(record, filename, 'fasta')
        if not skip:
            r_gb, r_fasta = utils.rotate_seq(
                filename, tmp=tmp, simple_validate=simple_validate)
            if r_gb is None:
                skip = 'structure_unusual'
            else:
                info.update(gb=r_gb, fasta=r_fasta, length=record_len)
                # keep the unrotated sequence in temp folder
                utils.move(filename, tmp/filename.with_suffix('.raw').name)
        info['skip'] = skip
        log.debug(f'{skip}')
    return divided
def process_ref(arg):
    """
    Preprocess reference

    Prefer ref because assembly passed ref. If arg.ref is not given, the
    reference is fetched by utils.get_ref() according to arg.taxon.

    Args:
        arg(NameSpace): arguments; ref, taxon, tmp, out, input and
            simple_validate are used
    Returns:
        r_ref_gb(Path): rotated gb, None if failed
        r_ref_fasta(Path): rotated fasta, None if the reference was not
            found
        ref_len(int): length of the reference, 0 if the reference was not
            found
    """
    if arg.ref is None:
        log.info(f'Taxonomy:\t{arg.taxon}')
        ref_gb, _ref_taxon = utils.get_ref(arg.taxon, arg.tmp)
        if ref_gb is None:
            log.critical('Failed to get reference.')
            log.debug(f'{arg.input} {arg.ref} REF_NOT_FOUND\n')
            return None, None, 0
        ref_gb = utils.move(ref_gb, arg.tmp/ref_gb.name)
        fmt = 'gb'
    else:
        log.info(f'Reference:\t{arg.ref}')
        fmt = utils.get_fmt(arg.ref)
        source = Path(arg.ref).absolute()
        # work on a copy, leave user's file untouched
        ref_gb = utils.move(source, arg.tmp/source.name, copy=True)
        ref_records = list(SeqIO.parse(ref_gb, fmt))
        if len(ref_records) > 1:
            log.warning('Given reference contains more than one records, '
                        'only use the first.')
            # assume given reference is ok since user refuse to auto download
            # reference from Genbank
            SeqIO.write(ref_records[0], ref_gb, fmt)
    log.info(f'Output:\t {arg.out}')
    ref_len = len(SeqIO.read(ref_gb, fmt))
    r_ref_gb, r_ref_fasta = utils.rotate_seq(
        ref_gb, tmp=arg.tmp, simple_validate=arg.simple_validate)
    if r_ref_gb is None:
        log.critical('Cannot process reference sequence.')
        log.critical('Please consider to use another reference.')
        log.debug(f'{arg.input} {arg.ref} REF_CANNOT_ROTATE\n')
    # on rotate failure the caller still gets the real reference length
    return r_ref_gb, r_ref_fasta, ref_len
def split(raw, number, output: Path):
    """
    Split reads of original file from the beginning.

    If set number to default('inf'), extract all reads.
    If number is "inf" and format is not gz, skip split.

    A read is only written when all four of its lines are available, so a
    truncated record at the end of the input is dropped instead of leaving
    dangling lines in the output.

    Args:
        raw(str or Path): input file, could be fastq or gz format
        number(int or 'inf'): number of reads to split, 'inf' for no limit
        output(Path): output folder
    Return:
        splitted(Path): splitted file, fastq format
        count(int): reads actually got, 0 for not split
    """
    raw = Path(raw).absolute()
    fmt = utils.get_fmt(raw)
    no_limit = (number == float('inf'))
    if fmt != 'gz' and no_limit:
        log.debug('Skip split for "inf" and non-gz.')
        return raw, 0
    splitted = output / raw.with_suffix(f'.{number}').name
    opener = gzip.open if fmt == 'gz' else open
    count = 0
    # context managers close both handles even if reading/writing fails
    with opener(raw, 'rb') as raw_handle, \
            open(splitted, 'wb') as splitted_handle:
        lines = iter(raw_handle)
        while count < number:
            # four line one record, read them all before writing anything
            try:
                record = (next(lines), next(lines), next(lines), next(lines))
            except StopIteration:
                break
            splitted_handle.writelines(record)
            count += 1
    # rename after the handles are closed, suffix tells the real read count
    splitted = utils.move(splitted, splitted.with_suffix(f'.{count}'))
    if not no_limit and number != count:
        log.warning(f'Want {number} reads, acutally got {count}.')
    return splitted, count
def assembly(arg, perl, novoplasty):
    """
    Assembly input file by wrapping NOVOPlasty.

    The way to find absolute path of perl seems not good enough (I forget
    why but I remember I tried).

    Seeds are tried one by one until one of them gives a result that
    passes validation. If all seeds fail, contigs generated from every
    seed are merged and the merged result is validated once.

    Args:
        arg(NameSpace): arguments, arg.input and arg.taxon may be
            overwritten here
        perl(str): perl location, may not be absolute path
        novoplasty(Path): novoplasty file
    Return:
        success(bool): success or not
    """
    def _patient_log():
        # hint user every 30s to avoid long time boring waiting
        n = 0
        while novoplasty_is_running:
            sleep(1)
            n += 1
            if n >= 30:
                log.info('NOVOPlasty is running, please be patient...')
                n = 0
        return

    def _validate_arg_str(target, reference, seed_name):
        # build the argument string passed to validate_main
        arg_str = (f'-input {target} -ref {reference} -seed {seed_name} '
                   f'-out {arg.out} '
                   f'-perc_identity {arg.perc_identity} '
                   f'-len_diff {arg.len_diff}')
        # if mt, simple validate
        if arg.simple_validate:
            arg_str += ' -simple_validate'
        return arg_str

    success = False
    log.info('')
    for i in arg.input:
        test = Path(i)
        # arg.list_file may contains invalid file
        if not test.exists():
            log.critical(f'Cannot find input file {i}')
            return success
        else:
            log.info(f'Input file: {i}')
    log.info(f'Minimum genome size: {arg.min}')
    log.info(f'Maximum genome size: {arg.max}')
    if arg.ref is not None:
        log.info(f'Reference: {arg.ref}')
    elif isinstance(arg.taxon, list):
        t_ = ' '.join(arg.taxon)
        log.info(f'Taxonomy: {t_}')
    else:
        log.info(f'Taxonomy: {arg.taxon}')
    log.info(f'Output folder: {arg.out}')
    # split
    # equal to zero or not, expose to user
    # equal to inf or not, hide inside
    have_gz = ('gz' in [utils.get_fmt(i) for i in arg.input])
    if arg.split != 0:
        log.info(f'Split {arg.split} pairs of reads for assembly')
        arg.input = [split(raw, arg.split, arg.tmp)[0] for raw in arg.input]
    # novoplasty calls gzip, which Windows does not have
    elif platform.system() == 'Windows' and have_gz:
        log.debug('Split for gz on Windows.')
        arg.input = [split(raw, float('inf'), arg.tmp)[0]
                     for raw in arg.input]
    # get ref
    if arg.ref is not None:
        if utils.get_fmt(arg.ref) != 'gb':
            log.critical(f'Reference file should be genbank format, '
                         f'but {arg.ref} is not.')
            return success
        ref = Path(arg.ref).absolute()
        ref = utils.move(ref, arg.tmp/ref.name, copy=True)
    else:
        # for "Genus species var. blabla", ignore subspecies words
        if isinstance(arg.taxon, list):
            arg.taxon = ' '.join(arg.taxon[:2])
        ref, arg.taxon = utils.get_ref(arg.taxon, arg.tmp,
                                       mt_mode=arg.mt_mode,
                                       simple_validate=arg.simple_validate)
        if ref is None:
            log.critical('Cannot get reference.')
            return success
        else:
            ref = utils.move(ref, arg.tmp/ref.name)
    # get seed
    seeds = []
    ordered_seeds = get_seed(ref, arg.raw, arg.seed)
    if arg.seed_file is not None:
        seeds.append(Path(arg.seed_file).absolute())
        # only add whole.seed; slicing instead of [-1] because get_seed
        # may return nothing, which must not raise IndexError here
        seeds.extend(ordered_seeds[-1:])
    else:
        seeds.extend(ordered_seeds)
    if len(seeds) == 0:
        log.critical('Cannot get seeds!')
        return success
    all_contigs = []
    for seed in seeds:
        log.info(f'Use {seed.stem} as seed.')
        config_file = config(seed, arg)
        log.info('Call NOVOPlasty... May need minutes (rarely half an hour)')
        # use mark to terminate thread
        novoplasty_is_running = True
        hint = Thread(target=_patient_log)
        hint.start()
        try:
            # ignore bad returncode
            run(f'{perl} {novoplasty} -c {config_file}', shell=True,
                stdout=DEVNULL, stderr=DEVNULL)
        finally:
            # always lower the mark, otherwise the hint thread never stops
            # if run() raises (e.g. interrupted by user)
            novoplasty_is_running = False
        # novoplasty use current folder as output folder
        circularized, options, merged, contigs = organize_out(arg, seed.stem)
        all_contigs.extend(contigs)
        if len(circularized) == 0 and len(options) == 0 and len(merged) == 0:
            log.warning(f'Assembled with {seed.stem} failed.')
            continue
        validated = []
        log.info('Validate assembly results.')
        # validate merged or not?
        for i in (*circularized, *options):
            validate_file, _report = validate_main(
                _validate_arg_str(i, ref, seed.stem))
            validated.extend(validate_file)
        if len(validated) != 0:
            # first seed that passes validation wins
            success = True
            break
        log.warning('No records passed validation.')
        log.critical(f'Assembly with {seed.stem} failed.')
    if not success:
        log.info('Failed with all seeds.')
        all_input = ' '.join([str(i.absolute()) for i in all_contigs])
        # '' means empty
        if len(all_input) == 0:
            return success
        log.info('Try to assembly contigs generated from each seed.')
        arg_str = f'-input {all_input} -o {arg.out/"Raw"/"merge_seed.fasta"}'
        n_assembly, assembly_result = merge_main(arg_str)
        if n_assembly != 0:
            validate_file, _report = validate_main(
                _validate_arg_str(assembly_result, ref, 'merge'))
            if len(validate_file) != 0:
                success = True
    return success
def organize_out(arg, seed: str):
    """
    Organize NOVOPlasty output.

    Files found in arg.out are handled according to their names:
        log*: log file, moved to arg.log with "NOVOPlasty-" prefix
        contigs_tmp*: temporary files, moved to arg.tmp
        Contigs*: contigs
        Merged*: merged contigs, may be circular or empty, contains options
        Option*: merged contigs, circular or incomplete circular
        Circularized*: circularized sequence
    Except for log files, the seed name is appended to the file name.
    Return fasta list.

    Arg:
        arg(NameSpace): args; out, tmp, log and raw folders are used
        seed(str): seed gene's name
    Return:
        circularized(list): circularized files
        options(list): options files
        merged(list): merged files, converted to fasta
        contigs(list): contig files
    """
    def txt_to_fasta(old):
        """
        Convert NOVOPlasty-generated txt to standard fasta.
        """
        kept = []
        inside_record = False
        with open(old, 'r') as raw:
            for line in raw:
                # a record starts at its ">" title line ...
                if line.startswith('>'):
                    inside_record = True
                # ... and stops at a blank line or a line led by space
                if line.startswith(' ') or not line.strip():
                    inside_record = False
                if inside_record:
                    kept.append(line)
        new = Path(old).with_suffix('.fasta')
        with open(new, 'w') as output:
            output.writelines(line.replace('*', '') for line in kept)
        return new

    def seed_tagged(path):
        # file name with the seed name inserted before the suffix
        return path.with_name(f'{path.stem}-{seed}{path.suffix}').name

    def to_raw(pattern):
        # move every match of the pattern into raw folder, return new paths
        moved = []
        for found in arg.out.glob(pattern):
            found = found.absolute()
            moved.append(utils.move(found, arg.raw/seed_tagged(found)))
        return moved

    for tmp_file in arg.out.glob('contigs_tmp_*.txt'):
        tmp_file = tmp_file.absolute()
        utils.move(tmp_file, arg.tmp/seed_tagged(tmp_file))
    for log_file in arg.out.glob('log_*.txt'):
        log_file = log_file.absolute()
        utils.move(log_file,
                   arg.log/log_file.with_name('NOVOPlasty-'+log_file.name).name)
    contigs = to_raw('Contigs_*.fasta')
    merged = []
    for txt in arg.out.glob('Merged_contigs_*.txt'):
        txt = utils.move(txt.absolute(), arg.raw/seed_tagged(txt.absolute()))
        fasta = txt_to_fasta(txt)
        merged.append(utils.move(fasta, arg.raw/fasta.name))
    options = to_raw('Option_*.fasta')
    circularized = to_raw('Circularized_assembly*.fasta')
    return circularized, options, merged, contigs
def normal_validate(divided: dict, r_ref_gb: Path, r_ref_fasta: Path, arg):
    """
    For chloroplast genomes with 4-parts structure

    Each record that is not marked as skipped is compared with the
    reference by BLAST and drawn. If validate_regions() reports a region
    to be reverse complemented, the region is reverse complemented, the
    sequence is rotated again and validated a second time.

    Args:
        divided(dict): info of divided files, one inner dict per record;
            the 'skip', 'gb', 'fasta' and 'length' keys are read
        r_ref_gb(Path): rotated gb of the reference
        r_ref_fasta(Path): rotated fasta of the reference
        arg(NameSpace): arguments; tmp, out, perc_identity,
            simple_validate, input and ref are used
    Return:
        divided(dict): the same dict, 'success', strand info and the length
            of each region are filled in place; None if any BLAST
            comparison gave nothing
    """
    for i in divided:
        success = False
        divided[i]['success'] = success
        if divided[i]['skip']:
            continue
        i_gb = divided[i]['gb']
        i_fasta = divided[i]['fasta']
        log.info(f'Analyze {i_fasta}.')
        option_regions = utils.get_regions(i_gb)
        # add regions info
        for _ in option_regions:
            divided[i][_] = len(option_regions[_])
        compare_result = compare_seq(i_fasta, r_ref_fasta, arg.tmp,
                                     arg.perc_identity)
        if compare_result is None:
            log.critical('Bad BLAST results.')
            log.debug(f'{arg.input} {arg.ref} BLAST_FAIL\n')
            return None
        pdf = draw(r_ref_gb, i_gb, compare_result)
        utils.move(pdf, arg.out/pdf.name)
        log.info('Detecting reverse complement region.')
        option_len = divided[i]['length']
        _count, to_rc, strand_info, bad_region = validate_regions(
            option_len, option_regions, compare_result, arg.perc_identity)
        divided[i].update(strand_info)
        if bad_region:
            # skip sequences with bad region
            continue
        if to_rc is not None:
            log.warning(f'Reverse complement the {to_rc} of {i_fasta.name}.')
            rc_fasta = utils.rc_regions(i_gb, to_rc)
            # clean old files
            utils.move(i_fasta, arg.tmp/(i_fasta.with_name(
                i_fasta.stem+'-noRC.fasta')).name)
            utils.move(i_gb, arg.tmp/(i_gb.with_name(
                i_gb.stem+'-noRC.gb')).name)
            rc_fasta = utils.move(rc_fasta, rc_fasta.with_suffix(''))
            r_rc_gb, r_rc_fasta = utils.rotate_seq(
                rc_fasta, tmp=arg.tmp, simple_validate=arg.simple_validate)
            if r_rc_gb is None:
                continue
            rc_fasta.unlink()
            r_rc_gb = utils.move(r_rc_gb, arg.out/r_rc_gb.with_name(
                r_rc_gb.stem+'_RC.gb').name)
            r_rc_fasta = utils.move(r_rc_fasta, arg.out/r_rc_fasta.with_name(
                r_rc_fasta.stem+'_RC.fasta').name)
            new_compare_result = compare_seq(r_rc_fasta, r_ref_fasta, arg.tmp,
                                             arg.perc_identity)
            if new_compare_result is None:
                # same handling as the first comparison, do not pass None
                # to draw() and validate_regions()
                log.critical('Bad BLAST results.')
                log.debug(f'{arg.input} {arg.ref} BLAST_FAIL\n')
                return None
            pdf = draw(r_ref_gb, r_rc_gb, new_compare_result)
            utils.move(pdf, arg.out/pdf.name)
            divided[i]['fasta'] = r_rc_fasta
            new_regions = utils.get_regions(r_rc_gb)
            for _ in new_regions:
                divided[i][_] = len(new_regions[_])
            # validate again
            _count_2, to_rc_2, *_ = validate_regions(
                option_len, new_regions, new_compare_result,
                arg.perc_identity)
            # still asking for reverse complement means failure
            if to_rc_2 is None:
                success = True
        else:
            utils.move(i_fasta, i_fasta.with_suffix('.fasta'))
            success = True
        divided[i]['success'] = success
    return divided