Beispiel #1
0
def simple_validate(divided: dict, r_ref_gb: Path, r_ref_fasta: Path, arg):
    """
    For chloroplast genomes WITHOUT 4-parts structure.
    Each record that is not marked as skipped is compared with the reference
    by BLAST and drawn. Reverse complement detection is not performed.
    Args:
        divided(dict): info of divided files, each value is a dict which
            has at least 'skip', 'gb' and 'fasta'
        r_ref_gb(Path): rotated gb of reference
        r_ref_fasta(Path): rotated fasta of reference
        arg(NameSpace): arguments, tmp/out/perc_identity/input/ref are used
    Return:
        divided(dict): same dict, 'success' and len() of each region from
            utils.get_regions were filled; None if BLAST failed
    """
    for i in divided:
        # assume failure until the record passes all steps
        divided[i]['success'] = False
        if divided[i]['skip']:
            continue
        i_gb = divided[i]['gb']
        i_fasta = divided[i]['fasta']
        log.info(f'Analyze {i_fasta}.')
        option_regions = utils.get_regions(i_gb)
        # add regions info
        for region in option_regions:
            divided[i][region] = len(option_regions[region])
        compare_result = compare_seq(i_fasta, r_ref_fasta, arg.tmp,
                                     arg.perc_identity)
        if compare_result is None:
            log.critical('Cannot run BLAST.')
            log.debug(f'{arg.input} {arg.ref} BLAST_FAIL\n')
            return None
        pdf = simple_draw(r_ref_gb, i_gb, compare_result)
        utils.move(pdf, arg.out/pdf.name)
        log.debug('Skip detecting reverse complement region.')
        utils.move(i_fasta, i_fasta.with_suffix('.fasta'))
        divided[i]['success'] = True
    # return after ALL records were handled, not after the first one
    return divided
Beispiel #2
0
def compare_seq(query, reference, tmp, perc_identity):
    """
    Run BLAST between two records and collect the hits.
    Args:
        query(Path or str): query file
        reference(Path or str): reference file
        tmp(Path): temp folder, BLAST result and log are moved into it
        perc_identity(float): percent identity for BLAST, multiplied by 100
            before it is passed to BLAST
    Return:
        hits(list or None): hits of the first parsed query, each item is
            [qstart, qend, sstart, send, sstrand, pident]; None if BLAST
            gave no result file or nothing was parsed
    """
    blast_result, blast_log = utils.blast(Path(query), reference,
                                          perc_identity*100)
    if blast_result is None:
        return None
    # only one record is expected in the file, outer loop is for unpack
    parsed = []
    for hits in utils.parse_blast_tab(blast_result):
        rows = []
        for hit in hits:
            # every hit must have exactly these 12 fields
            (_qseqid, _sseqid, sstrand, _qlen, _slen, _length, pident,
             _gapopen, qstart, qend, sstart, send) = hit
            rows.append([qstart, qend, sstart, send, sstrand, pident])
        parsed.append(rows)
    # clean up only after parsing finished
    utils.move(blast_result, tmp/blast_result.name)
    utils.move(blast_log, tmp/blast_log.name)
    return parsed[0] if parsed else None
Beispiel #3
0
def divide_records(fasta: Path, output: Path, ref_len: int,
                   tmp: Path, len_diff=0.1, simple_validate=False):
    """
    Write each record of the fasta file into its own file.
    Records whose length differs too much from the reference, or which
    cannot be rotated, are marked in the 'skip' field.
    Args:
        fasta(Path): fasta file
        output(Path): output folder
        ref_len(int): length of reference, to filter bad records
        tmp(Path): temp folder
        len_diff: maximum allowed length difference
        simple_validate(bool): passed to utils.rotate_seq
    Returns:
        divided(dict): info of divided files, key is the file written for
            the record
    """
    records = list(SeqIO.parse(fasta, 'fasta'))
    n_records = len(records)
    fields = ('gb', 'fasta', 'length', 'LSC', 'IRa', 'SSC', 'IRb',
              'missing', 'incomplete', 'rc', 'figure', 'figure_after',
              'skip')
    log.info(f'Found {n_records} records in {fasta.name}.')
    if n_records > 1:
        log.info('Divide them into different files.')
    log.info(f"Check record's length (reference: {ref_len} bp, "
             f"difference limit {len_diff:.2%}).")
    divided = {}
    for number, record in enumerate(records, start=1):
        # keep original name if there is only one record
        if n_records > 1:
            target = output / f'{fasta.stem}_{number}{fasta.suffix}'
        else:
            target = output / fasta.name
        info = dict.fromkeys(fields, '')
        divided[target] = info
        size = len(record)
        # relative difference, negative means shorter than reference
        ratio = (size/ref_len) - 1
        info['fasta'] = target
        info['length'] = size
        info['length_diff'] = ratio
        # False or the reason of skip
        skip = False
        if abs(ratio) > len_diff:
            log.warning(f'Skip NO.{number} record ({size} bp, '
                        f'length difference {ratio:.2%}).')
            skip = 'undersize' if ratio < 0 else 'oversize'
        SeqIO.write(record, target, 'fasta')
        if not skip:
            r_gb, r_fasta = utils.rotate_seq(
                target, tmp=tmp, simple_validate=simple_validate)
            if r_gb is None:
                skip = 'structure_unusual'
            else:
                info.update(gb=r_gb, fasta=r_fasta, length=size)
                # keep the unrotated file in temp folder
                utils.move(target, tmp/target.with_suffix('.raw').name)
        info['skip'] = skip
        log.debug(f'{skip}')
    return divided
Beispiel #4
0
def process_ref(arg):
    """
    Preprocess reference.
    Use arg.ref if it was given (assembly passes ref), otherwise get the
    reference by arg.taxon.
    Returns:
        r_ref_gb(Path): rotated gb, None if failed
        r_ref_fasta(Path): rotated fasta, None if failed
        ref_len(int): length of the reference, 0 if reference was not found
    """
    if arg.ref is None:
        log.info(f'Taxonomy:\t{arg.taxon}')
        ref_gb, _taxon = utils.get_ref(arg.taxon, arg.tmp)
        if ref_gb is None:
            log.critical('Failed to get reference.')
            log.debug(f'{arg.input} {arg.ref} REF_NOT_FOUND\n')
            return None, None, 0
        ref_gb = utils.move(ref_gb, arg.tmp/ref_gb.name)
        fmt = 'gb'
    else:
        log.info(f'Reference:\t{arg.ref}')
        fmt = utils.get_fmt(arg.ref)
        source = Path(arg.ref).absolute()
        # work on a copy in temp folder
        ref_gb = utils.move(source, arg.tmp/source.name, copy=True)
        candidates = list(SeqIO.parse(ref_gb, fmt))
        if len(candidates) > 1:
            log.warning('Given reference contains more than one records, '
                        'only use the first.')
            # assume given reference is ok since user refuse to auto download
            # reference from Genbank
            SeqIO.write(candidates[0], ref_gb, fmt)
    log.info(f'Output:\t {arg.out}')
    ref_len = len(SeqIO.read(ref_gb, fmt))
    rotated = utils.rotate_seq(
        ref_gb, tmp=arg.tmp, simple_validate=arg.simple_validate)
    r_ref_gb, r_ref_fasta = rotated
    if r_ref_gb is None:
        log.critical('Cannot process reference sequence.')
        log.critical('Please consider to use another reference.')
        log.debug(f'{arg.input} {arg.ref} REF_CANNOT_ROTATE\n')
    # same tuple for success and rotate failure, caller checks for None
    return r_ref_gb, r_ref_fasta, ref_len
Beispiel #5
0
def split(raw, number, output: Path):
    """
    Split reads of original file from the beginning.
    If set number to default('inf'), extract all reads.
    If number is "inf" and format is not gz, skip split.
    Only complete records (four lines) are written, a truncated record at
    the end of the input is dropped.
    Args:
        raw(str or Path): input file, could be fastq or gz format
        number(int or 'inf'): number of reads to split, 'inf' for no limit
        output(Path): output folder
    Return:
        splitted(Path): splitted file, fastq format
        count(int): reads actually got, 0 for not split
    """
    raw = Path(raw).absolute()
    fmt = utils.get_fmt(raw)
    if fmt != 'gz' and number == float('inf'):
        log.debug('Skip split for "inf" and non-gz.')
        return raw, 0
    splitted = output / raw.with_suffix(f'.{number}').name
    opener = gzip.open if fmt == 'gz' else open
    count = 0
    # context managers close both handles even if reading/writing fails
    with opener(raw, 'rb') as raw_handle, \
            open(splitted, 'wb') as splitted_handle:
        lines = iter(raw_handle)
        while count < number:
            # four line one record, read all of them before writing to
            # avoid leaving a partial record in the output
            try:
                record = (next(lines), next(lines), next(lines), next(lines))
            except StopIteration:
                break
            splitted_handle.writelines(record)
            count += 1
    # rename with the number of reads actually got
    splitted = utils.move(splitted, splitted.with_suffix(f'.{count}'))
    if number != float('inf') and number != count:
        log.warning(f'Want {number} reads, acutally got {count}.')
    return splitted, count
Beispiel #6
0
def assembly(arg, perl, novoplasty):
    """
    Assembly input file by wrapping NOVOPlasty.
    The way to find absolute path of perl seems not good enough (I forget why
    but I remember I tried).
    Each seed is tried in order until one of them gives records that pass
    validation. If all seeds failed, contigs generated from every seed are
    merged and validated as the last try.
    Args:
        arg(NameSpace): arguments
        perl(str): perl location, may not be absolute path
        novoplasty(Path): novoplasty file
    Return:
        success(bool): success or not
    """
    def _patient_log():
        # hint user every 30s to avoid long time boring waiting
        n = 0
        while novoplasty_is_running:
            sleep(1)
            n += 1
            if n >= 30:
                log.info('NOVOPlasty is running, please be patient...')
                n = 0
        return

    success = False
    log.info('')
    for i in arg.input:
        test = Path(i)
        # arg.list_file may contains invalid file
        if not test.exists():
            log.critical(f'Cannot find input file {i}')
            return success
        else:
            log.info(f'Input file: {i}')
    log.info(f'Minimum genome size: {arg.min}')
    log.info(f'Maximum genome size: {arg.max}')
    if arg.ref is not None:
        log.info(f'Reference: {arg.ref}')
    elif isinstance(arg.taxon, list):
        t_ = ' '.join(arg.taxon)
        log.info(f'Taxonomy: {t_}')
    else:
        log.info(f'Taxonomy: {arg.taxon}')
    log.info(f'Output folder: {arg.out}')
    # split
    # equal to zero or not, expose to user
    # equal to inf or not, hide inside
    have_gz = ('gz' in [utils.get_fmt(i) for i in arg.input])
    if arg.split != 0:
        log.info(f'Split {arg.split} pairs of reads for assembly')
        splitted = []
        for raw in arg.input:
            new, _ = split(raw, arg.split, arg.tmp)
            splitted.append(new)
        arg.input = splitted
    # novoplasty calls gzip, which Windows does not have
    elif platform.system() == 'Windows' and have_gz:
        log.debug('Split for gz on Windows.')
        splitted = []
        for raw in arg.input:
            new, _ = split(raw, float('inf'), arg.tmp)
            splitted.append(new)
        arg.input = splitted
    # get ref
    if arg.ref is not None:
        if utils.get_fmt(arg.ref) != 'gb':
            log.critical(f'Reference file should be genbank format, '
                         f'but {arg.ref} is not.')
            return success
        ref = Path(arg.ref).absolute()
        ref = utils.move(ref, arg.tmp/ref.name, copy=True)
    else:
        # for "Genus species var. blabla", ignore subspecies words
        if isinstance(arg.taxon, list):
            arg.taxon = ' '.join(arg.taxon[:2])
        ref, arg.taxon = utils.get_ref(arg.taxon, arg.tmp, mt_mode=arg.mt_mode,
                                       simple_validate=arg.simple_validate)
        if ref is None:
            log.critical('Cannot get reference.')
            return success
        else:
            ref = utils.move(ref, arg.tmp/ref.name)
    # get seed
    seeds = []
    ordered_seeds = get_seed(ref, arg.raw, arg.seed)
    if arg.seed_file is not None:
        seeds.append(Path(arg.seed_file).absolute())
        # only add whole.seed, get_seed may return nothing
        if len(ordered_seeds) != 0:
            seeds.append(ordered_seeds[-1])
    else:
        seeds.extend(ordered_seeds)
    if len(seeds) == 0:
        log.critical('Cannot get seeds!')
        return success
    csv_files = []
    all_contigs = []
    for seed in seeds:
        log.info(f'Use {seed.stem} as seed.')
        config_file = config(seed, arg)
        log.info('Call NOVOPlasty... May need minutes (rarely half an hour)')
        # use mark to terminate thread
        novoplasty_is_running = True
        hint = Thread(target=_patient_log)
        hint.start()
        try:
            # ignore bad returncode
            run(f'{perl} {novoplasty} -c {config_file}', shell=True,
                stdout=DEVNULL, stderr=DEVNULL)
        finally:
            # always stop the hint thread, even if run() was interrupted,
            # or it loops forever and blocks exit of the interpreter
            novoplasty_is_running = False
            # wait for it to quit so that hint threads of different seeds
            # never overlap
            hint.join()

        # novoplasty use current folder as output folder
        circularized, options, merged, contigs = organize_out(arg, seed.stem)
        all_contigs.extend(contigs)
        if len(circularized) == 0 and len(options) == 0 and len(merged) == 0:
            log.warning(f'Assembled with {seed.stem} failed.')
            continue
        validated = []
        log.info('Validate assembly results.')
        # validate merged or not?
        for i in (*circularized, *options):
            arg_str = (f'-input {i} -ref {ref} -seed {seed.stem} '
                       f'-out {arg.out} '
                       f'-perc_identity {arg.perc_identity} '
                       f'-len_diff {arg.len_diff}')
            # if mt, simple validate
            if arg.simple_validate:
                arg_str += ' -simple_validate'
            validate_file, report = validate_main(arg_str)
            validated.extend(validate_file)
            if report not in csv_files:
                csv_files.append(report)
        if len(validated) != 0:
            # first good seed is enough
            success = True
            break
        log.warning('No records passed validation.')
        log.critical(f'Assembly with {seed.stem} failed.')
    if not success:
        log.info('Failed with all seeds.')
        all_input = ' '.join([str(i.absolute()) for i in all_contigs])
        # '' means empty
        if len(all_input) == 0:
            return success
        log.info('Try to assembly contigs generated from each seed.')
        arg_str = f'-input {all_input} -o {arg.out/"Raw"/"merge_seed.fasta"}'
        n_assembly, assembly_result = merge_main(arg_str)
        if n_assembly != 0:
            arg_str = (f'-input {assembly_result} -ref {ref} -seed merge '
                       f'-out {arg.out} '
                       f'-perc_identity {arg.perc_identity} '
                       f'-len_diff {arg.len_diff}')
            if arg.simple_validate:
                arg_str += ' -simple_validate'
            validate_file, report = validate_main(arg_str)
            if len(validate_file) != 0:
                success = True
                csv_files.append(report)
    return success
Beispiel #7
0
def organize_out(arg, seed: str):
    """
    Organize NOVOPlasty output found in arg.out.
        log*: log file, moved to arg.log
        contigs_tmp*: temporary files, moved to arg.tmp
        Contigs*: contigs
        Merged*: merged contigs, may be circular or empty, contains options
        Option*: merged contigs, circular or incomplete circular
        Circularized*: circularized sequence
    Seed's name is appended to file names except for log files.
    Arg:
        arg(NameSpace): args, out/tmp/log/raw folders are used
        seed(str): seed gene's name
    Return:
        circularized(list): circularized files
        options(list): options files
        merged(list): merged files, converted to fasta
        contigs(list): contig files
    """
    def txt_to_fasta(old):
        """
        Convert NOVOPlasty-generated txt to standard fasta.
        """
        kept = []
        in_record = False
        with open(old, 'r') as raw:
            for line in raw:
                if line.startswith('>'):
                    # header opens a record
                    in_record = True
                elif line.startswith(' ') or len(line.strip()) == 0:
                    # indented or blank line closes it
                    in_record = False
                if in_record:
                    kept.append(line)
        new = Path(old).with_suffix('.fasta')
        with open(new, 'w') as output:
            for line in kept:
                output.write(line.replace('*', ''))
        return new

    def with_seed(path):
        # file name with seed's name appended to the stem
        return path.with_name(f'{path.stem}-{seed}{path.suffix}').name

    def to_raw(pattern):
        # move matched files into arg.raw, return their new locations
        moved = []
        for found in arg.out.glob(pattern):
            found = found.absolute()
            moved.append(utils.move(found, arg.raw/with_seed(found)))
        return moved

    for tmp_file in arg.out.glob('contigs_tmp_*.txt'):
        tmp_file = tmp_file.absolute()
        utils.move(tmp_file, arg.tmp/with_seed(tmp_file))
    for log_file in arg.out.glob('log_*.txt'):
        log_file = log_file.absolute()
        new_name = log_file.with_name('NOVOPlasty-'+log_file.name).name
        utils.move(log_file, arg.log/new_name)
    contigs = to_raw('Contigs_*.fasta')
    merged = []
    for txt in arg.out.glob('Merged_contigs_*.txt'):
        txt = txt.absolute()
        txt = utils.move(txt, arg.raw/with_seed(txt))
        fasta = txt_to_fasta(txt)
        fasta = utils.move(fasta, arg.raw/fasta.name)
        merged.append(fasta)
    options = to_raw('Option_*.fasta')
    circularized = to_raw('Circularized_assembly*.fasta')
    return circularized, options, merged, contigs
Beispiel #8
0
def normal_validate(divided: dict, r_ref_gb: Path, r_ref_fasta: Path, arg):
    """
    For chloroplast genomes with 4-parts structure.
    Each record that is not marked as skipped is compared with the reference
    by BLAST and drawn. If validate_regions reports a region to reverse
    complement, the region is reverse-complemented, the sequence is rotated
    and validated again.
    Args:
        divided(dict): info of divided files, each value is a dict which
            has at least 'skip', 'gb', 'fasta' and 'length'
        r_ref_gb(Path): rotated gb of reference
        r_ref_fasta(Path): rotated fasta of reference
        arg(NameSpace): arguments, tmp/out/perc_identity/simple_validate/
            input/ref are used
    Return:
        divided(dict): same dict, 'success', strand info and len() of each
            region from utils.get_regions were filled; None if BLAST failed
    """
    for i in divided:
        # assume failure until the record passes all steps
        success = False
        divided[i]['success'] = success
        if divided[i]['skip']:
            continue
        i_gb = divided[i]['gb']
        i_fasta = divided[i]['fasta']
        log.info(f'Analyze {i_fasta}.')
        option_regions = utils.get_regions(i_gb)
        # add regions info
        for region in option_regions:
            divided[i][region] = len(option_regions[region])
        compare_result = compare_seq(i_fasta, r_ref_fasta, arg.tmp,
                                     arg.perc_identity)
        if compare_result is None:
            log.critical('Bad BLAST results.')
            log.debug(f'{arg.input} {arg.ref} BLAST_FAIL\n')
            return None
        pdf = draw(r_ref_gb, i_gb, compare_result)
        utils.move(pdf, arg.out/pdf.name)
        log.info('Detecting reverse complement region.')
        option_len = divided[i]['length']
        _count, to_rc, strand_info, bad_region = validate_regions(
            option_len, option_regions, compare_result, arg.perc_identity)
        divided[i].update(strand_info)
        if bad_region:
            # skip sequences with bad region
            continue
        if to_rc is not None:
            log.warning(f'Reverse complement the {to_rc} of {i_fasta.name}.')
            rc_fasta = utils.rc_regions(i_gb, to_rc)
            # clean old files
            utils.move(i_fasta, arg.tmp/(i_fasta.with_name(
                i_fasta.stem+'-noRC.fasta')).name)
            utils.move(i_gb, arg.tmp/(i_gb.with_name(
                i_gb.stem+'-noRC.gb')).name)
            rc_fasta = utils.move(rc_fasta, rc_fasta.with_suffix(''))
            r_rc_gb, r_rc_fasta = utils.rotate_seq(
                rc_fasta, tmp=arg.tmp, simple_validate=arg.simple_validate)
            if r_rc_gb is None:
                # cannot rotate, leave the record as failed
                continue
            rc_fasta.unlink()
            r_rc_gb = utils.move(r_rc_gb, arg.out/r_rc_gb.with_name(
                r_rc_gb.stem+'_RC.gb').name)
            r_rc_fasta = utils.move(r_rc_fasta, arg.out/r_rc_fasta.with_name(
                r_rc_fasta.stem+'_RC.fasta').name)
            new_compare_result = compare_seq(r_rc_fasta, r_ref_fasta, arg.tmp,
                                             arg.perc_identity)
            if new_compare_result is None:
                # same handling as the first BLAST, do not pass None to draw
                log.critical('Bad BLAST results.')
                log.debug(f'{arg.input} {arg.ref} BLAST_FAIL\n')
                return None
            pdf = draw(r_ref_gb, r_rc_gb, new_compare_result)
            utils.move(pdf, arg.out/pdf.name)
            divided[i]['fasta'] = r_rc_fasta
            new_regions = utils.get_regions(r_rc_gb)
            for region in new_regions:
                divided[i][region] = len(new_regions[region])
            # validate again
            _count_2, to_rc_2, *_rest = validate_regions(
                option_len, new_regions, new_compare_result, arg.perc_identity)
            # success only if nothing needs reverse complement any more
            if to_rc_2 is None:
                success = True
        else:
            utils.move(i_fasta, i_fasta.with_suffix('.fasta'))
            success = True
        divided[i]['success'] = success
    # return after ALL records were handled, not after the first one
    return divided