Esempio n. 1
0
def correct_meta_references(ref_fpaths,
                            corrected_dirpath,
                            downloaded_refs=False):
    """Correct every reference FASTA and merge the accepted ones.

    Each file in ``ref_fpaths`` is read sequence by sequence. Every sequence
    is renamed to ``<corrected file name>_<unique sequence name>`` and
    appended to a per-reference corrected file inside ``corrected_dirpath``.
    A reference that was processed completely is then appended to the
    combined reference file named by ``qconfig.combined_ref_name``.

    A reference is rejected when it yields no sequences or, unless
    ``qconfig.no_check`` is set, when ``correct_seq`` returns a false value
    for one of its sequences.

    Args:
        ref_fpaths: list of paths to the reference FASTA files. Rejected
            references are removed from this list in place.
        corrected_dirpath: directory that receives the corrected files and
            the combined reference.
        downloaded_refs: if true, a rejected reference is skipped with a
            warning; otherwise ``logger.error`` is called with
            ``exit_with_code=1``.

    Returns:
        A tuple ``(corrected_ref_fpaths, combined_ref_fpath,
        chromosomes_by_refs, ref_fpaths)``. ``chromosomes_by_refs`` maps a
        reference label to a list of ``(corrected sequence name, sequence
        length)`` pairs.

    Side effects:
        Adds one entry per written sequence to
        ``contigs_analyzer.ref_labels_by_chromosomes``.
    """
    corrected_ref_fpaths = []
    combined_ref_fpath = os.path.join(corrected_dirpath,
                                      qconfig.combined_ref_name)
    chromosomes_by_refs = {}

    def _proceed_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references,
                     ref_fpath):
        # total_references is the 1-based position of this sequence inside
        # the current reference file: the first sequence registers a new
        # corrected file, the following ones reuse it.
        if total_references > 1:
            corr_seq_fpath = corrected_ref_fpaths[-1]
        else:
            corr_seq_fpath = qutils.unique_corrected_fpath(
                os.path.join(corrected_dirpath, ref_name + ref_fasta_ext))
            corrected_ref_fpaths.append(corr_seq_fpath)
        corr_seq_name = qutils.name_from_fpath(corr_seq_fpath) + '_' + seq_name

        if not qconfig.no_check:
            # NOTE(review): only the truthiness of the correct_seq() result
            # is used; the unmodified `seq` is what gets written below.
            # Confirm that discarding the corrected sequence is intended.
            if not correct_seq(seq, ref_fpath):
                return None, None

        fastaparser.write_fasta(corr_seq_fpath, [(corr_seq_name, seq)], 'a')

        contigs_analyzer.ref_labels_by_chromosomes[
            corr_seq_name] = qutils.name_from_fpath(corr_seq_fpath)
        chromosomes_by_refs[ref_name].append((corr_seq_name, len(seq)))

        return corr_seq_name, corr_seq_fpath

    excluded_ref_fpaths = []
    ref_names = qutils.process_labels(ref_fpaths)
    for ref_fpath, ref_name in zip(ref_fpaths, ref_names):
        _, ref_fasta_ext = qutils.splitext_for_fasta_file(
            os.path.basename(ref_fpath))

        chromosomes_by_refs[ref_name] = []
        used_seq_names = defaultdict(int)
        # Remember how many corrected files existed before this reference so
        # that a rejected reference can undo exactly its own registration.
        n_corrected_before = len(corrected_ref_fpaths)

        corr_seq_fpath = None
        total_references = 0
        for seq_name, seq in fastaparser.read_fasta(ref_fpath):
            total_references += 1
            seq_name = correct_name(seq_name,
                                    qutils.MAX_CONTIG_NAME - len(ref_name) - 1)
            uniq_seq_name = get_uniq_name(seq_name, used_seq_names)
            used_seq_names[seq_name] += 1
            corr_seq_name, corr_seq_fpath = _proceed_seq(
                uniq_seq_name, seq, ref_name, ref_fasta_ext, total_references,
                ref_fpath)
            if not corr_seq_name:
                # One bad sequence rejects the whole reference.
                break

        if corr_seq_fpath:
            logger.main_info('  ' + ref_fpath + ' ==> ' +
                             qutils.name_from_fpath(corr_seq_fpath))
            fastaparser.write_fasta(combined_ref_fpath,
                                    fastaparser.read_fasta(corr_seq_fpath),
                                    'a')
        elif downloaded_refs:
            logger.warning(
                'Skipping ' + ref_fpath + ' because it'
                ' is empty or contains incorrect sequences (header-only or with non-ACGTN characters)!'
            )
            # Roll back everything recorded for this reference.
            for corr_seq_name, _ in chromosomes_by_refs[ref_name]:
                del contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name]
            del chromosomes_by_refs[ref_name]
            # An empty file never registers a corrected path, so a blind
            # pop() would drop the previous reference's path (or raise
            # IndexError on an empty list). Truncating is safe in both cases.
            del corrected_ref_fpaths[n_corrected_before:]
            excluded_ref_fpaths.append(ref_fpath)
        else:
            logger.error(
                'Reference file ' + ref_fpath +
                ' is empty or contains incorrect sequences (header-only or with non-ACGTN characters)!',
                exit_with_code=1)

    # Removal is deferred so the list is not mutated while it is iterated.
    for excluded in excluded_ref_fpaths:
        ref_fpaths.remove(excluded)

    if chromosomes_by_refs:
        logger.main_info('  All references were combined in ' +
                         qconfig.combined_ref_name)
    else:
        logger.warning('All references were skipped!')

    return corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_fpaths
Esempio n. 2
0
def correct_meta_references(ref_fpaths, corrected_dirpath, downloaded_refs=False):
    """Write corrected copies of the references and combine the accepted ones into one FASTA.

    For every path in ``ref_fpaths`` the sequences are renamed to
    ``<corrected file name>_<unique sequence name>`` and appended to a corrected file in
    ``corrected_dirpath``. Once a reference has been processed without a rejected sequence, its
    corrected file is appended to the combined reference (``qconfig.combined_ref_name``).

    A reference is rejected if it contains no sequences or, when ``qconfig.no_check`` is not set,
    if ``correct_seq`` returns a false value for any of its sequences.

    Args:
        ref_fpaths: list of reference FASTA paths; rejected references are removed from it in place.
        corrected_dirpath: output directory for the corrected files and the combined reference.
        downloaded_refs: if true, rejected references are skipped with a warning instead of being
            reported through ``logger.error(..., exit_with_code=1)``.

    Returns:
        ``(corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_fpaths)`` where
        ``chromosomes_by_refs`` maps each reference label to a list of
        ``(corrected sequence name, sequence length)`` pairs.

    Side effects:
        Registers every written sequence in ``contigs_analyzer.ref_labels_by_chromosomes``.
    """
    corrected_ref_fpaths = []
    combined_ref_fpath = os.path.join(corrected_dirpath, qconfig.combined_ref_name)
    chromosomes_by_refs = {}

    def _proceed_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references, ref_fpath):
        # total_references counts sequences seen so far in the current file (1-based): only the
        # first sequence creates and registers a corrected file, later ones append to it.
        if total_references > 1:
            corr_seq_fpath = corrected_ref_fpaths[-1]
        else:
            corr_seq_fpath = qutils.unique_corrected_fpath(os.path.join(corrected_dirpath, ref_name + ref_fasta_ext))
            corrected_ref_fpaths.append(corr_seq_fpath)
        corr_seq_name = qutils.name_from_fpath(corr_seq_fpath) + '_' + seq_name

        if not qconfig.no_check:
            # NOTE(review): the value returned by correct_seq() is only tested for truthiness;
            # the raw `seq` is written below. Confirm this is intended.
            if not correct_seq(seq, ref_fpath):
                return None, None

        fastaparser.write_fasta(corr_seq_fpath, [(corr_seq_name, seq)], 'a')

        contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name] = qutils.name_from_fpath(corr_seq_fpath)
        chromosomes_by_refs[ref_name].append((corr_seq_name, len(seq)))

        return corr_seq_name, corr_seq_fpath

    excluded_ref_fpaths = []
    ref_names = qutils.process_labels(ref_fpaths)
    for ref_fpath, ref_name in zip(ref_fpaths, ref_names):
        _, ref_fasta_ext = qutils.splitext_for_fasta_file(os.path.basename(ref_fpath))

        chromosomes_by_refs[ref_name] = []
        used_seq_names = defaultdict(int)
        # Length of the result list before this reference; used to undo its registration on skip.
        first_own_idx = len(corrected_ref_fpaths)

        corr_seq_fpath = None
        total_references = 0
        for seq_name, seq in fastaparser.read_fasta(ref_fpath):
            total_references += 1
            seq_name = correct_name(seq_name, qutils.MAX_CONTIG_NAME - len(ref_name) - 1)
            uniq_seq_name = get_uniq_name(seq_name, used_seq_names)
            used_seq_names[seq_name] += 1
            corr_seq_name, corr_seq_fpath = _proceed_seq(uniq_seq_name, seq, ref_name, ref_fasta_ext,
                                                         total_references, ref_fpath)
            if not corr_seq_name:
                break  # a single rejected sequence rejects the whole reference

        if corr_seq_fpath:
            logger.main_info('  ' + ref_fpath + ' ==> ' + qutils.name_from_fpath(corr_seq_fpath))
            fastaparser.write_fasta(combined_ref_fpath, fastaparser.read_fasta(corr_seq_fpath), 'a')
        elif downloaded_refs:
            logger.warning('Skipping ' + ref_fpath + ' because it'
                           ' is empty or contains incorrect sequences (header-only or with non-ACGTN characters)!')
            # cleaning
            for corr_seq_name, _ in chromosomes_by_refs[ref_name]:
                del contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name]
            del chromosomes_by_refs[ref_name]
            # A file without sequences never appended a path, so pop() would discard the previous
            # reference's entry or fail on an empty list; slice deletion handles both cases.
            del corrected_ref_fpaths[first_own_idx:]
            excluded_ref_fpaths.append(ref_fpath)
        else:
            logger.error('Reference file ' + ref_fpath +
                         ' is empty or contains incorrect sequences (header-only or with non-ACGTN characters)!',
                         exit_with_code=1)

    # Done after the loop so ref_fpaths is not modified while being iterated.
    for excluded in excluded_ref_fpaths:
        ref_fpaths.remove(excluded)

    if chromosomes_by_refs:
        logger.main_info('  All references were combined in ' + qconfig.combined_ref_name)
    else:
        logger.warning('All references were skipped!')

    return corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_fpaths
Esempio n. 3
0
def correct_meta_references(ref_fpaths, corrected_dirpath):
    """Write corrected copies of the references and append every sequence to a combined FASTA.

    Sequences of each file in ``ref_fpaths`` are renamed to
    ``<corrected file name>_<unique sequence name>`` and appended both to a per-reference
    corrected file in ``corrected_dirpath`` and to the combined reference file
    (``qconfig.combined_ref_name``). References whose file names (without FASTA extension)
    collide are labelled with ``qutils.get_label_from_par_dir_and_fname`` instead.

    Unless ``qconfig.no_check`` is set, processing of a reference stops at the first sequence
    for which ``correct_seq`` returns a false value.

    Args:
        ref_fpaths: list of reference FASTA paths.
        corrected_dirpath: output directory for the corrected files and the combined reference.

    Returns:
        ``(corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_fpaths)`` where
        ``chromosomes_by_refs`` maps each reference label to a list of
        ``(corrected sequence name, sequence length)`` pairs.

    Side effects:
        Registers every written sequence in ``contigs_analyzer.ref_labels_by_chromosomes``.
    """
    corrected_ref_fpaths = []
    combined_ref_fpath = os.path.join(corrected_dirpath, qconfig.combined_ref_name)
    chromosomes_by_refs = {}

    def _proceed_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references, ref_fpath):
        # total_references counts sequences seen so far in the current file (1-based): only the
        # first sequence creates and registers a corrected file, later ones append to it.
        if total_references > 1:
            corr_seq_fpath = corrected_ref_fpaths[-1]
        else:
            corr_seq_fpath = qutils.unique_corrected_fpath(os.path.join(corrected_dirpath, ref_name + ref_fasta_ext))
            corrected_ref_fpaths.append(corr_seq_fpath)
        corr_seq_name = qutils.name_from_fpath(corr_seq_fpath) + '_' + seq_name

        if not qconfig.no_check:
            # NOTE(review): the value returned by correct_seq() is only tested for truthiness;
            # the raw `seq` is written below. Confirm this is intended.
            if not correct_seq(seq, ref_fpath):
                return None, None

        fastaparser.write_fasta(corr_seq_fpath, [(corr_seq_name, seq)], 'a')
        fastaparser.write_fasta(combined_ref_fpath, [(corr_seq_name, seq)], 'a')

        contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name] = qutils.name_from_fpath(corr_seq_fpath)
        chromosomes_by_refs[ref_name].append((corr_seq_name, len(seq)))

        return corr_seq_name, corr_seq_fpath

    # Split every file name once and count the base names in a single pass; the previous
    # list.count()-per-element scan was quadratic in the number of references.
    split_fnames = [qutils.splitext_for_fasta_file(os.path.basename(ref_fpath)) for ref_fpath in ref_fpaths]
    name_counts = defaultdict(int)
    for base_name, _ in split_fnames:
        name_counts[base_name] += 1
    dupl_ref_names = set(base_name for base_name, count in name_counts.items() if count > 1)

    for ref_fpath, (ref_name, ref_fasta_ext) in zip(ref_fpaths, split_fnames):
        if ref_name in dupl_ref_names:
            # Disambiguate references that share a file name.
            ref_name = qutils.get_label_from_par_dir_and_fname(ref_fpath)

        chromosomes_by_refs[ref_name] = []
        used_seq_names = defaultdict(int)

        corr_seq_fpath = None
        total_references = 0
        for seq_name, seq in fastaparser.read_fasta(ref_fpath):
            total_references += 1
            seq_name = correct_name(seq_name, qutils.MAX_CONTIG_NAME - len(ref_name) - 1)
            uniq_seq_name = get_uniq_name(seq_name, used_seq_names)
            used_seq_names[seq_name] += 1
            corr_seq_name, corr_seq_fpath = _proceed_seq(uniq_seq_name, seq, ref_name, ref_fasta_ext,
                                                         total_references, ref_fpath)
            if not corr_seq_name:
                break

        # NOTE(review): a reference that is empty or stops at a rejected sequence is passed over
        # silently here, and sequences written before the rejection stay in the output files.
        # Confirm whether callers rely on this.
        if corr_seq_fpath:
            logger.main_info('  ' + ref_fpath + ' ==> ' + qutils.name_from_fpath(corr_seq_fpath))

    logger.main_info('  All references combined in ' + qconfig.combined_ref_name)

    return corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_fpaths