Ejemplo n.º 1
0
def process_ref(ref_fpaths, organism, downloaded_dirpath, max_organism_name_len, downloaded_organisms, not_founded_organisms,
                 total_downloaded, total_scored_left):
    """Obtain the reference FASTA for one organism and log the outcome.

    The expected file is ``<downloaded_dirpath>/<correct_name(organism)>.fasta``.
    An existing file is reused; otherwise ``download_ref`` is called unless
    the organism is already listed in ``not_founded_organisms``.

    Side effects: on success the path is appended to ``ref_fpaths`` (a reused
    file only if it is not already listed) and the organism is appended to
    ``downloaded_organisms``; on failure the organism is added to
    ``not_founded_organisms``. One progress line is logged in every case.

    Returns:
        ``(new_ref_fpath, total_downloaded, total_scored_left)`` where
        ``total_scored_left`` has been decremented by one and
        ``total_downloaded`` incremented when a reference was obtained.
    """
    ref_fpath = os.path.join(downloaded_dirpath, correct_name(organism) + '.fasta')
    padding = ' ' * (max_organism_name_len - len(organism))
    shown_name = organism.replace('+', ' ')

    reused = os.path.exists(ref_fpath)
    if reused:
        new_ref_fpath = ref_fpath
    elif organism not in not_founded_organisms:
        new_ref_fpath = download_ref(organism, ref_fpath)
    else:
        new_ref_fpath = None

    total_scored_left -= 1

    # Guard clause: nothing on disk and nothing downloaded.
    if not new_ref_fpath:
        logger.main_info("  %s%s | not found in the NCBI database" % (shown_name, padding))
        not_founded_organisms.add(organism)
        return new_ref_fpath, total_downloaded, total_scored_left

    total_downloaded += 1
    if reused:
        template = "  %s%s | was downloaded previously (total %d, %d more to go)"
    else:
        template = "  %s%s | successfully downloaded (total %d, %d more to go)"
    logger.main_info(template % (shown_name, padding, total_downloaded, total_scored_left))

    # A fresh download is always appended; a reused file only once.
    if not reused or new_ref_fpath not in ref_fpaths:
        ref_fpaths.append(new_ref_fpath)
    downloaded_organisms.append(organism)
    return new_ref_fpath, total_downloaded, total_scored_left
Ejemplo n.º 2
0
def parse_gff(file, feature):
    """Collect ``Gene`` objects for the lines of ``file`` matching ``feature``.

    Each line is matched against the module-level ``gff_pattern``; a line is
    kept only when its lower-cased ``feature`` group equals ``feature``.
    Every ``key=value`` item of the ``attributes`` group is stored in
    ``gene.attributes`` under the lower-cased key, and the ``id`` / ``name``
    keys are additionally copied to ``gene.id`` / ``gene.name``.

    Returns:
        The list of genes in input order; ``gene.number`` is the zero-based
        position of the gene in that list.
    """
    genes = []

    for raw_line in file:
        m = gff_pattern.match(raw_line.rstrip())
        if not m or m.group('feature').lower() != feature:
            continue

        gene = Gene(seqname=qutils.correct_name(m.group('seqname')),
                    start=int(m.group('start')),
                    end=int(m.group('end')))

        for item in m.group('attributes').split(';'):
            if '=' not in item:
                continue
            # Split on the first '=' only: the value may itself contain '='.
            key, _, val = item.partition('=')
            lowered_key = key.lower()
            if lowered_key == 'id':
                gene.id = val
            elif lowered_key == 'name':
                gene.name = val
            gene.attributes[lowered_key] = val

        # Genes are numbered in the order they are accepted.
        gene.number = len(genes)
        genes.append(gene)

    return genes
Ejemplo n.º 3
0
def parse_ncbi(ncbi_file):
    """Parse gene records from an open NCBI-format text file.

    A record starts at a line matching the module-level
    ``ncbi_start_pattern`` (which provides the ``number`` and ``name``
    groups) and extends up to the next such line or end of file. Within a
    record, lines starting with ``Chromosome:``, ``Annotation:`` and ``ID:``
    fill in the gene's chromosome, seqname/start/end and id respectively.

    Args:
        ncbi_file: object with a ``readline()`` method returning ``''`` at
            end of file.

    Returns:
        List of ``Gene`` objects; records for which no start/end could be
        parsed are dropped.
    """
    annotation_pattern = re.compile(r'Annotation: (?P<seqname>.+) \((?P<start>\d+)\.\.(?P<end>\d+)(, complement)?\)', re.I)
    chromosome_pattern = re.compile(r'Chromosome: (?P<chromosome>\S+);', re.I)
    id_pattern = re.compile(r'ID: (?P<id>\d+)', re.I)

    genes = []

    line = ncbi_file.readline()
    while line != '':
        # Skip blank lines and '##' comment lines preceding a record.
        while line != '' and (line.rstrip() == '' or line.startswith('##')):
            line = ncbi_file.readline()
        if line == '':
            # Only blanks/comments were left; previously this fell through
            # into a loop that never advanced and therefore never ended.
            break

        m = ncbi_start_pattern.match(line.rstrip())
        if not m:
            # Not a record header: skip the line instead of re-matching the
            # same text forever.
            line = ncbi_file.readline()
            continue

        gene = Gene(number=int(m.group('number')),
                    name=qutils.correct_name(m.group('name')))

        # Gather the record body: everything up to the next header or EOF.
        the_rest_lines = []
        line = ncbi_file.readline()
        while line != '' and not ncbi_start_pattern.match(line.rstrip()):
            the_rest_lines.append(line.rstrip())
            line = ncbi_file.readline()

        for info_line in the_rest_lines:
            if info_line.startswith('Chromosome:'):
                m = chromosome_pattern.match(info_line)
                if m:
                    gene.chromosome = m.group('chromosome')

            if info_line.startswith('Annotation:'):
                m = annotation_pattern.match(info_line)
                if m:
                    gene.seqname = m.group('seqname')
                    gene.start = int(m.group('start'))
                    gene.end = int(m.group('end'))

                    to_trim = 'Chromosome' + ' ' + str(gene.chromosome)
                    if gene.chromosome and gene.seqname.startswith(to_trim):
                        # str.lstrip returns a new string; the result must be
                        # assigned back or the separator is never removed.
                        gene.seqname = gene.seqname[len(to_trim):].lstrip(' ,')

                else:
                    logger.warning('Wrong NCBI annotation for gene ' + str(gene.number) + '. ' + gene.name + '. Skipping this gene.')

            if info_line.startswith('ID:'):
                m = id_pattern.match(info_line)
                if m:
                    gene.id = m.group('id')
                else:
                    logger.warning('Can\'t parse gene\'s ID in NCBI format. Gene is ' + str(gene.number) + '. ' + gene.name + '. Skipping it.')

        # Only genes with parsed coordinates are kept.
        if gene.start is not None and gene.end is not None:
            genes.append(gene)
    return genes
Ejemplo n.º 4
0
def find_all_sv(bed_fpath):
    """Read structural variations from a tab-separated BED-like file.

    Lines starting with ``#`` and lines with fewer than seven tab-separated
    fields are ignored. For the remaining lines, fields 0-2 and 3-5 describe
    the two breakpoints (reference name, start, end) and field 6 is searched
    for the variation type.

    Args:
        bed_fpath: path to the file; a falsy value means "no file".

    Returns:
        ``None`` when ``bed_fpath`` is falsy, otherwise a
        ``StructuralVariations`` object with its ``translocations``,
        ``inversions`` and ``relocations`` lists filled in.
    """
    if not bed_fpath:
        return None
    region_struct_variations = StructuralVariations()
    # The context manager closes the file; it was previously left open.
    with open(bed_fpath) as f:
        for line in f:
            fs = line.split('\t')
            if len(fs) <= 6 or line.startswith('#'):
                continue
            try:
                align1 = Mapping(s1=int(fs[1]), e1=int(fs[2]), ref=correct_name(fs[0]), s2=None, e2=None, len1=None, len2=None, idy=None, contig=None)
                align2 = Mapping(s1=int(fs[4]), e1=int(fs[5]),  ref=correct_name(fs[3]), s2=None, e2=None, len1=None, len2=None, idy=None, contig=None)
            except ValueError:
                continue  # incorrect line format
            if align1.ref != align2.ref:
                # Breakpoints on different references.
                region_struct_variations.translocations.append((align1, align2))
            elif 'INV' in fs[6]:
                region_struct_variations.inversions.append((align1, align2))
            elif 'DEL' in fs[6]:
                region_struct_variations.relocations.append((align1, align2))
            # other variation types are not supported yet
    return region_struct_variations
Ejemplo n.º 5
0
def find_all_sv(bed_fpath):
    """Read structural variations from a tab-separated BED-like file.

    Lines starting with ``#`` and lines with fewer than seven tab-separated
    fields are ignored. For the remaining lines, fields 0-2 and 3-5 describe
    the two breakpoints (reference name, start, end) and field 6 carries the
    variation type, which is also passed to ``Mapping`` as ``sv_type``.

    Args:
        bed_fpath: path to the file; a falsy value means "no file".

    Returns:
        ``None`` when ``bed_fpath`` is falsy, otherwise a
        ``StructuralVariations`` object with its ``translocations``,
        ``inversions`` and ``relocations`` lists filled in.
    """
    if not bed_fpath:
        return None

    region_struct_variations = StructuralVariations()
    with open(bed_fpath) as f:
        for line in f:
            if line.startswith('#'):
                continue
            fs = line.split('\t')
            # Fields up to index 6 are read below; a short or blank line would
            # otherwise raise IndexError, which the ValueError handler does
            # not catch.
            if len(fs) <= 6:
                continue
            try:
                align1 = Mapping(s1=int(fs[1]), e1=int(fs[2]), ref=correct_name(fs[0]), sv_type=fs[6])
                align2 = Mapping(s1=int(fs[4]), e1=int(fs[5]), ref=correct_name(fs[3]), sv_type=fs[6])
            except ValueError:
                continue  # incorrect line format
            if align1.ref != align2.ref:
                # Breakpoints on different references.
                region_struct_variations.translocations.append((align1, align2))
            elif 'INV' in fs[6]:
                region_struct_variations.inversions.append((align1, align2))
            elif 'DEL' in fs[6] or 'INS' in fs[6] or 'BND' in fs[6]:
                region_struct_variations.relocations.append((align1, align2))
            # other variation types are not supported yet
    return region_struct_variations
Ejemplo n.º 6
0
def get_correct_names_for_chroms(output_dirpath, fasta_fpath, sam_fpath, err_path, reads_fpaths, logger, is_reference=False):
    """Map chromosome names of a SAM header to the names used in a FASTA file.

    The SAM header is extracted with ``sambamba view -H -S`` into
    ``<dirname(output_dirpath)>/<basename(sam_fpath)>.header`` when
    ``sam_fpath`` exists; otherwise a previously written header file is used.
    The ``@SQ`` entries (``SN:``/``LN:``) are then compared pairwise, in
    iteration order, with the chromosome lengths of ``fasta_fpath``.

    Args:
        output_dirpath: directory whose parent receives the header file.
        fasta_fpath: FASTA file the SAM file is expected to correspond to.
        sam_fpath: SAM file path.
        err_path: file that the subprocess's stderr is appended to.
        reads_fpaths: if truthy, a mismatch is logged as a warning, otherwise
            as an error.
        logger: logger used for messages and handed to the subprocess call.
        is_reference: only changes the wording of the mismatch message.

    Returns:
        A dict ``{sam_chr_name: fasta_chr_name}``, or ``None`` when neither
        the SAM file nor its header file exists or when the chromosome count,
        names or lengths do not match.
    """
    correct_chr_names = dict()
    fasta_chr_lengths = get_chr_lengths_from_fastafile(fasta_fpath)
    sam_chr_lengths = dict()
    sam_header_fpath = join(dirname(output_dirpath), basename(sam_fpath) + '.header')
    if not isfile(sam_fpath) and not isfile(sam_header_fpath):
        return None
    if isfile(sam_fpath):
        # Context managers make sure both handles are flushed and closed
        # before the header file is read back below.
        with open(sam_header_fpath, 'w') as header_out, open(err_path, 'a') as err_out:
            qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-H', '-S', sam_fpath],
                                   stdout=header_out, stderr=err_out, logger=logger)
    # Raw strings: '\S' and '\d' are invalid escapes in ordinary literals.
    chr_name_pattern = r'SN:(\S+)'
    chr_len_pattern = r'LN:(\d+)'

    with open(sam_header_fpath) as sam_in:
        for line in sam_in:
            if line.startswith('@SQ'):
                chr_name = re.findall(chr_name_pattern, line)[0]
                chr_len = re.findall(chr_len_pattern, line)[0]
                sam_chr_lengths[chr_name] = int(chr_len)

    inconsistency = ''
    if len(fasta_chr_lengths) != len(sam_chr_lengths):
        inconsistency = 'Number of chromosomes'
    else:
        # NOTE(review): pairs chromosomes by iteration order of the two
        # mappings -- assumes both list chromosomes in the same order.
        for fasta_chr, sam_chr in zip(fasta_chr_lengths.keys(), sam_chr_lengths.keys()):
            if correct_name(sam_chr) == fasta_chr[:len(sam_chr)] and sam_chr_lengths[sam_chr] == fasta_chr_lengths[fasta_chr]:
                correct_chr_names[sam_chr] = fasta_chr
            elif sam_chr_lengths[sam_chr] != fasta_chr_lengths[fasta_chr]:
                inconsistency = 'Chromosome lengths'
                break
            else:
                inconsistency = 'Chromosome names'
                break
    if inconsistency:
        if reads_fpaths:
            logger.warning(inconsistency + ' in ' + fasta_fpath + ' and corresponding SAM file ' + sam_fpath + ' do not match. ' +
                           'QUAST will try to realign reads to ' + ('the reference genome' if is_reference else fasta_fpath))
        else:
            logger.error(inconsistency + ' in ' + fasta_fpath + ' and corresponding SAM file ' + sam_fpath + ' do not match. ' +
                         'Use SAM file obtained by aligning reads to ' + ('the reference genome' if is_reference else fasta_fpath))
        return None
    return correct_chr_names
Ejemplo n.º 7
0
def parse_txt(file):
    """Build ``Gene`` objects from the lines of a plain-text gene list.

    Each right-stripped line is tried against ``txt_pattern_gi`` first and
    ``txt_pattern`` second; lines matching neither are ignored. The matched
    ``start``/``end`` values are stored in ascending order and ``gene_id``
    becomes ``gene.id``.

    Returns:
        The genes in input order, numbered from zero.
    """
    genes = []

    for raw_line in file:
        stripped = raw_line.rstrip()
        m = txt_pattern_gi.match(stripped)
        if not m:
            m = txt_pattern.match(stripped)
        if not m:
            continue

        # Accepted genes are numbered consecutively, so the next number is
        # simply the current list length.
        gene = Gene(number=len(genes),
                    seqname=qutils.correct_name(m.group('seqname')))
        s = int(m.group('start'))
        e = int(m.group('end'))
        # Coordinates may be given in either order; store low then high.
        low, high = (s, e) if s <= e else (e, s)
        gene.start = low
        gene.end = high
        gene.id = m.group('gene_id')
        genes.append(gene)

    return genes
Ejemplo n.º 8
0
def parse_txt(file):
    """Build ``Gene`` objects from the lines of a plain-text gene list.

    Each right-stripped line is tried against ``txt_pattern_gi`` first and
    ``txt_pattern`` second; lines matching neither are ignored. The matched
    ``start``/``end`` values are stored in ascending order and ``gene_id``
    becomes ``gene.id``.

    Returns:
        The genes in input order, numbered from zero.
    """
    genes = []

    for raw_line in file:
        stripped = raw_line.rstrip()
        m = txt_pattern_gi.match(stripped)
        if not m:
            m = txt_pattern.match(stripped)
        if not m:
            continue

        # Accepted genes are numbered consecutively, so the next number is
        # simply the current list length.
        gene = Gene(number=len(genes),
                    seqname=qutils.correct_name(m.group('seqname')))
        s = int(m.group('start'))
        e = int(m.group('end'))
        # Coordinates may be given in either order; store low then high.
        low, high = (s, e) if s <= e else (e, s)
        gene.start = low
        gene.end = high
        gene.id = m.group('gene_id')
        genes.append(gene)

    return genes
Ejemplo n.º 9
0
def parse_bed(file):
    """Build ``Gene`` objects from whitespace-separated BED-like lines.

    Lines containing only whitespace are skipped. For every other line the
    first three fields are the sequence name and two integer coordinates; an
    optional fourth field becomes ``gene.id`` (``None`` when absent). The
    coordinates are stored in ascending order and the strand is ``'+'`` when
    the first coordinate is strictly smaller than the second, else ``'-'``.

    Returns:
        The genes in input order, numbered from zero.
    """
    genes = []

    for raw_line in file:
        fields = raw_line.split()
        if not fields:
            continue

        seqname = fields[0]
        s = int(fields[1])
        e = int(fields[2])

        # One gene per non-blank line, so the running number equals the
        # number of genes collected so far.
        gene = Gene(number=len(genes), seqname=qutils.correct_name(seqname))
        forward = s < e
        gene.start = s if forward else min(s, e)
        gene.end = e if forward else max(s, e)
        gene.id = fields[3] if len(fields) > 3 else None
        gene.strand = '+' if forward else '-'

        genes.append(gene)

    return genes
Ejemplo n.º 10
0
def parse_bed(file):
    """Build ``Gene`` objects from whitespace-separated BED-like lines.

    Lines containing only whitespace are skipped. For every other line the
    first three fields are the sequence name and two integer coordinates; an
    optional fourth field becomes ``gene.id`` (``None`` when absent). The
    coordinates are stored in ascending order and the strand is ``'+'`` when
    the first coordinate is strictly smaller than the second, else ``'-'``.

    Returns:
        The genes in input order, numbered from zero.
    """
    genes = []

    for raw_line in file:
        fields = raw_line.split()
        if not fields:
            continue

        seqname = fields[0]
        s = int(fields[1])
        e = int(fields[2])

        # One gene per non-blank line, so the running number equals the
        # number of genes collected so far.
        gene = Gene(number=len(genes), seqname=qutils.correct_name(seqname))
        forward = s < e
        gene.start = s if forward else min(s, e)
        gene.end = e if forward else max(s, e)
        gene.id = fields[3] if len(fields) > 3 else None
        gene.strand = '+' if forward else '-'

        genes.append(gene)

    return genes
Ejemplo n.º 11
0
def correct_meta_references(ref_fpaths,
                            corrected_dirpath,
                            downloaded_refs=False):
    """Write per-reference FASTA files plus one combined reference file.

    For every path in ``ref_fpaths`` the sequences are read, renamed to
    ``<corrected file name>_<unique sequence name>`` and appended to a
    uniquely named file inside ``corrected_dirpath``. Each successfully
    processed reference is also appended to the combined reference file
    (``qconfig.combined_ref_name`` inside ``corrected_dirpath``).

    A reference that yields no sequences, or for which ``correct_seq``
    rejects a sequence (only checked when ``qconfig.no_check`` is false), is
    handled according to ``downloaded_refs``: when true it is skipped with a
    warning and removed from ``ref_fpaths``; otherwise ``logger.error`` is
    called with ``exit_with_code=1``.

    Args:
        ref_fpaths: list of reference FASTA paths; modified in place when
            references are skipped.
        corrected_dirpath: output directory.
        downloaded_refs: see above.

    Returns:
        ``(corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs,
        ref_fpaths)`` where ``chromosomes_by_refs`` maps each reference label
        to a list of ``(corrected sequence name, sequence length)``.
    """
    corrected_ref_fpaths = []

    combined_ref_fpath = os.path.join(corrected_dirpath,
                                      qconfig.combined_ref_name)

    chromosomes_by_refs = {}

    def _proceed_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references,
                     ref_fpath):
        """Append one sequence to the corrected file of its reference.

        Returns ``(corr_seq_name, corr_seq_fpath)``, or ``(None, None)`` when
        checking is enabled and ``correct_seq`` returns a falsy value.
        """
        if total_references > 1:
            # Later sequences go to the file registered by the first one.
            corr_seq_fpath = corrected_ref_fpaths[-1]
        else:
            corr_seq_fpath = qutils.unique_corrected_fpath(
                os.path.join(corrected_dirpath, ref_name + ref_fasta_ext))
            corrected_ref_fpaths.append(corr_seq_fpath)
        corr_seq_name = qutils.name_from_fpath(corr_seq_fpath) + '_' + seq_name
        if not qconfig.no_check:
            # NOTE(review): only the truthiness of correct_seq's result is
            # used; the unmodified `seq` is what gets written below --
            # confirm this is intended.
            if not correct_seq(seq, ref_fpath):
                return None, None

        fastaparser.write_fasta(corr_seq_fpath, [(corr_seq_name, seq)], 'a')

        contigs_analyzer.ref_labels_by_chromosomes[
            corr_seq_name] = qutils.name_from_fpath(corr_seq_fpath)
        chromosomes_by_refs[ref_name].append((corr_seq_name, len(seq)))

        return corr_seq_name, corr_seq_fpath

    excluded_ref_fpaths = []
    ref_names = qutils.process_labels(ref_fpaths)
    for ref_fpath, ref_name in zip(ref_fpaths, ref_names):
        # Number of sequences read from the current reference so far.
        total_references = 0
        ref_fname = os.path.basename(ref_fpath)
        _, ref_fasta_ext = qutils.splitext_for_fasta_file(ref_fname)

        chromosomes_by_refs[ref_name] = []
        used_seq_names = defaultdict(int)

        corr_seq_fpath = None
        for seq_name, seq in fastaparser.read_fasta(ref_fpath):
            total_references += 1
            seq_name = correct_name(seq_name,
                                    qutils.MAX_CONTIG_NAME - len(ref_name) - 1)
            uniq_seq_name = get_uniq_name(seq_name, used_seq_names)
            used_seq_names[seq_name] += 1
            corr_seq_name, corr_seq_fpath = _proceed_seq(
                uniq_seq_name, seq, ref_name, ref_fasta_ext, total_references,
                ref_fpath)
            if not corr_seq_name:
                break
        if corr_seq_fpath:
            logger.main_info('  ' + ref_fpath + ' ==> ' +
                             qutils.name_from_fpath(corr_seq_fpath) + '')
            fastaparser.write_fasta(combined_ref_fpath,
                                    fastaparser.read_fasta(corr_seq_fpath),
                                    'a')
        elif downloaded_refs:
            logger.warning(
                'Skipping ' + ref_fpath + ' because it'
                ' is empty or contains incorrect sequences (header-only or with non-ACGTN characters)!'
            )
            # cleaning
            for corr_seq_name, _ in chromosomes_by_refs[ref_name]:
                del contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name]
            del chromosomes_by_refs[ref_name]
            # A corrected path is registered only once the first sequence has
            # been seen. For an empty reference nothing was appended, and an
            # unconditional pop() would drop the previous reference's path
            # (or raise IndexError on an empty list).
            if total_references > 0:
                corrected_ref_fpaths.pop()
            excluded_ref_fpaths.append(ref_fpath)
        else:
            logger.error(
                'Reference file ' + ref_fpath +
                ' is empty or contains incorrect sequences (header-only or with non-ACGTN characters)!',
                exit_with_code=1)
    for excluded in excluded_ref_fpaths:
        ref_fpaths.remove(excluded)

    if len(chromosomes_by_refs) > 0:
        logger.main_info('  All references were combined in ' +
                         qconfig.combined_ref_name)
    else:
        logger.warning('All references were skipped!')

    return corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_fpaths
Ejemplo n.º 12
0
def correct_meta_references(ref_fpaths, corrected_dirpath, downloaded_refs=False):
    """Write per-reference FASTA files plus one combined reference file.

    For every path in ``ref_fpaths`` the sequences are read, renamed to
    ``<corrected file name>_<unique sequence name>`` and appended to a
    uniquely named file inside ``corrected_dirpath``. Each successfully
    processed reference is also appended to the combined reference file
    (``qconfig.combined_ref_name`` inside ``corrected_dirpath``).

    A reference that yields no sequences, or for which ``correct_seq``
    rejects a sequence (only checked when ``qconfig.no_check`` is false), is
    handled according to ``downloaded_refs``: when true it is skipped with a
    warning and removed from ``ref_fpaths``; otherwise ``logger.error`` is
    called with ``exit_with_code=1``.

    Args:
        ref_fpaths: list of reference FASTA paths; modified in place when
            references are skipped.
        corrected_dirpath: output directory.
        downloaded_refs: see above.

    Returns:
        ``(corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs,
        ref_fpaths)`` where ``chromosomes_by_refs`` maps each reference label
        to a list of ``(corrected sequence name, sequence length)``.
    """
    corrected_ref_fpaths = []

    combined_ref_fpath = os.path.join(corrected_dirpath, qconfig.combined_ref_name)

    chromosomes_by_refs = {}

    def _proceed_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references, ref_fpath):
        """Append one sequence to the corrected file of its reference.

        Returns ``(corr_seq_name, corr_seq_fpath)``, or ``(None, None)`` when
        checking is enabled and ``correct_seq`` returns a falsy value.
        """
        if total_references > 1:
            # Later sequences go to the file registered by the first one.
            corr_seq_fpath = corrected_ref_fpaths[-1]
        else:
            corr_seq_fpath = qutils.unique_corrected_fpath(os.path.join(corrected_dirpath, ref_name + ref_fasta_ext))
            corrected_ref_fpaths.append(corr_seq_fpath)
        corr_seq_name = qutils.name_from_fpath(corr_seq_fpath) + '_' + seq_name
        if not qconfig.no_check:
            # NOTE(review): only the truthiness of correct_seq's result is
            # used; the unmodified `seq` is what gets written below --
            # confirm this is intended.
            if not correct_seq(seq, ref_fpath):
                return None, None

        fastaparser.write_fasta(corr_seq_fpath, [(corr_seq_name, seq)], 'a')

        contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name] = qutils.name_from_fpath(corr_seq_fpath)
        chromosomes_by_refs[ref_name].append((corr_seq_name, len(seq)))

        return corr_seq_name, corr_seq_fpath

    excluded_ref_fpaths = []
    ref_names = qutils.process_labels(ref_fpaths)
    for ref_fpath, ref_name in zip(ref_fpaths, ref_names):
        # Number of sequences read from the current reference so far.
        total_references = 0
        ref_fname = os.path.basename(ref_fpath)
        _, ref_fasta_ext = qutils.splitext_for_fasta_file(ref_fname)

        chromosomes_by_refs[ref_name] = []
        used_seq_names = defaultdict(int)

        corr_seq_fpath = None
        for seq_name, seq in fastaparser.read_fasta(ref_fpath):
            total_references += 1
            seq_name = correct_name(seq_name, qutils.MAX_CONTIG_NAME - len(ref_name) - 1)
            uniq_seq_name = get_uniq_name(seq_name, used_seq_names)
            used_seq_names[seq_name] += 1
            corr_seq_name, corr_seq_fpath = _proceed_seq(uniq_seq_name, seq, ref_name, ref_fasta_ext, total_references, ref_fpath)
            if not corr_seq_name:
                break
        if corr_seq_fpath:
            logger.main_info('  ' + ref_fpath + ' ==> ' + qutils.name_from_fpath(corr_seq_fpath) + '')
            fastaparser.write_fasta(combined_ref_fpath, fastaparser.read_fasta(corr_seq_fpath), 'a')
        elif downloaded_refs:
            logger.warning('Skipping ' + ref_fpath + ' because it'
                           ' is empty or contains incorrect sequences (header-only or with non-ACGTN characters)!')
            # cleaning
            for corr_seq_name, _ in chromosomes_by_refs[ref_name]:
                del contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name]
            del chromosomes_by_refs[ref_name]
            # A corrected path is registered only once the first sequence has
            # been seen. For an empty reference nothing was appended, and an
            # unconditional pop() would drop the previous reference's path
            # (or raise IndexError on an empty list).
            if total_references > 0:
                corrected_ref_fpaths.pop()
            excluded_ref_fpaths.append(ref_fpath)
        else:
            logger.error('Reference file ' + ref_fpath +
                         ' is empty or contains incorrect sequences (header-only or with non-ACGTN characters)!',
                         exit_with_code=1)
    for excluded in excluded_ref_fpaths:
        ref_fpaths.remove(excluded)

    if len(chromosomes_by_refs) > 0:
        logger.main_info('  All references were combined in ' + qconfig.combined_ref_name)
    else:
        logger.warning('All references were skipped!')

    return corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_fpaths
Ejemplo n.º 13
0
def get_corr_name(name):
    """Return ``name`` after passing it through ``qutils.correct_name``."""
    corrected = qutils.correct_name(name)
    return corrected
Ejemplo n.º 14
0
def process_blast(blast_assemblies, downloaded_dirpath, corrected_dirpath, labels, blast_check_fpath, err_fpath):
    """Run BlastN for the assemblies and pick candidate reference species.

    After making sure the BLAST binaries and a database are available
    (``qconfig.custom_blast_db_fpath`` or the downloaded default), BlastN is
    run in parallel for ``blast_assemblies``. The tabular result file of each
    label is then scanned: column positions are taken from the ``# Fields:``
    comment line, and hits passing the identity/length/bit-score thresholds
    from ``qconfig`` are collected.

    Returns:
        ``(species_scores, species_by_assembly, replacement_dict)`` where
        ``species_scores`` is a list of ``(seqname, query_id, score)``,
        ``species_by_assembly`` maps each label to its selected seqnames and
        ``replacement_dict`` maps a query id to the seqnames of its further
        hits. ``(None, None, None)`` is returned when the binaries or the
        database are unavailable or when no species was selected, so callers
        can always unpack three values.
    """
    if not download_blast_binaries(filenames=blast_filenames):
        return None, None, None

    if qconfig.custom_blast_db_fpath:
        global db_fpath
        db_fpath = qconfig.custom_blast_db_fpath
        if isdir(db_fpath):
            db_aux_files = [f for f in os.listdir(db_fpath) if f.endswith('.nsq')]
            if db_aux_files:
                db_fpath = join(qconfig.custom_blast_db_fpath, db_aux_files[0].replace('.nsq', ''))
        elif isfile(db_fpath) and db_fpath.endswith('.nsq'):
            db_fpath = db_fpath[:-len('.nsq')]
        if not os.path.isfile(db_fpath + '.nsq'):
            logger.error('You should specify path to BLAST database obtained by running makeblastdb command: '
                         'either path to directory containing <dbname>.nsq file or path to <dbname>.nsq file itself.'
                         ' Also you can rerun MetaQUAST without --blast-db option. MetaQUAST uses SILVA 16S RNA database by default.',
                         exit_with_code=2)

    elif not download_blastdb():
        return None, None, None

    blast_res_fpath = os.path.join(downloaded_dirpath, 'blast.res')

    if len(blast_assemblies) > 0:
        logger.main_info('Running BlastN..')
        n_jobs = min(qconfig.max_threads, len(blast_assemblies))
        blast_threads = max(1, qconfig.max_threads // n_jobs)
        if is_python2():
            from joblib import Parallel, delayed
        else:
            from joblib3 import Parallel, delayed
        Parallel(n_jobs=n_jobs)(delayed(parallel_blast)(assembly.fpath, assembly.label, corrected_dirpath,
                                                        err_fpath, blast_res_fpath, blast_check_fpath, blast_threads)
                                for i, assembly in enumerate(blast_assemblies))

    logger.main_info()
    species_scores = []
    species_by_assembly = dict()
    max_entries = 4  # at most this many accepted hits are recorded per query
    replacement_dict = defaultdict(list)
    for label in labels:
        assembly_scores = []
        assembly_species = []
        res_fpath = get_blast_output_fpath(blast_res_fpath, label)
        if os.path.exists(res_fpath):
            refs_for_query = 0
            with open(res_fpath) as res_file:
                query_id_col, subj_id_col, idy_col, len_col, score_col = None, None, None, None, None
                for line in res_file:
                    if line.startswith('#'):
                        # A comment line resets the per-query hit counter.
                        refs_for_query = 0
                        # Fields: query id, subject id, % identity, alignment length, mismatches, gap opens, q. start, q. end, s. start, s. end, evalue, bit score
                        if 'Fields' in line:
                            field_names = line.strip().split('Fields: ')[-1].split(', ')
                            query_id_col = field_names.index('query id')
                            subj_id_col = field_names.index('subject id')
                            idy_col = field_names.index('% identity')
                            len_col = field_names.index('alignment length')
                            score_col = field_names.index('bit score')
                        continue

                    fs = line.split()
                    # Without a Fields header the column layout is unknown;
                    # comparing len(fs) with None would raise TypeError.
                    if score_col is None or refs_for_query >= max_entries or len(fs) <= score_col:
                        continue

                    query_id = fs[query_id_col]
                    organism_id = fs[subj_id_col]
                    idy = float(fs[idy_col])
                    length = int(fs[len_col])
                    score = float(fs[score_col])
                    if not (idy >= qconfig.identity_threshold and length >= qconfig.min_length
                            and score >= qconfig.min_bitscore):
                        continue

                    seqname, taxons = parse_organism_id(organism_id)
                    if not seqname:
                        continue
                    name_parts = seqname.split('_')
                    if len(name_parts) <= 1 or 'uncultured' in seqname:
                        continue
                    species_name = name_parts[0] + '_' + name_parts[1]

                    if refs_for_query == 0:
                        # First accepted hit of this query.
                        if species_name not in assembly_species:
                            assembly_scores.append((seqname, query_id, score))
                            if taxons:
                                taxons_for_krona[correct_name(seqname)] = taxons
                            # Record the species whether or not taxons are
                            # known, so the replacement branch below can find
                            # it on later queries.
                            assembly_species.append(species_name)
                            refs_for_query += 1
                        else:
                            # Distinct loop variable: reusing seqname/query_id/
                            # score in the comprehension would overwrite the
                            # current hit's values on Python 2.
                            seq_scores = [entry for entry in assembly_scores if species_name in entry[0]]
                            if seq_scores and score > seq_scores[0][2]:
                                assembly_scores.remove(seq_scores[0])
                                assembly_scores.append((seqname, query_id, score))
                                if taxons:
                                    taxons_for_krona[correct_name(seqname)] = taxons
                                refs_for_query += 1
                    elif seqname not in replacement_dict[query_id]:
                        # Further hits of the same query are kept as possible
                        # replacements.
                        replacement_dict[query_id].append(seqname)
                        refs_for_query += 1
        # Rank by bit score (third tuple item). Sorting the raw tuples would
        # order by seqname and truncate to an arbitrary subset.
        assembly_scores = sorted(assembly_scores, key=lambda entry: entry[2], reverse=True)
        assembly_scores = assembly_scores[:qconfig.max_references]
        for seqname, query_id, score in assembly_scores:
            # Skip seqnames already selected for a previously processed label.
            if not any(seqname in species_list for species_list in species_by_assembly.values()):
                species_scores.append((seqname, query_id, score))
        species_by_assembly[label] = [seqname for seqname, query_id, score in assembly_scores]
    if not species_scores:
        return None, None, None
    return species_scores, species_by_assembly, replacement_dict
def process_refs(organisms, assemblies, labels, downloaded_dirpath, not_founded_organisms, downloaded_ref_fpaths,
                 blast_check_fpath, err_fpath, organisms_assemblies=None):
    """Obtain reference FASTA files for ``organisms`` and update check files.

    For each organism the file
    ``<downloaded_dirpath>/<correct_name(organism)>.fasta`` is reused if it
    exists; otherwise ``download_refs`` is called unless the organism is
    already in ``not_founded_organisms``. Organisms for which nothing was
    obtained are added to ``not_founded_organisms``.

    Afterwards the BLAST check file of every ``(assembly, label)`` pair is
    rewritten: its first line is kept (or created from the assembly path and
    md5 checksum) and followed by the lists of downloaded and not-found
    organisms, restricted to ``organisms_assemblies[label]`` when
    ``organisms_assemblies`` is given.

    Returns:
        List of reference file paths obtained in this call. When
        ``organisms`` is empty the list is empty, the check files are left
        untouched and ``err_fpath`` is removed unless ``qconfig.debug`` is
        set.
    """
    ref_fpaths = []
    downloaded_organisms = []

    total_downloaded = 0
    total_scored_left = len(organisms)
    if total_scored_left == 0:
        if not qconfig.debug and os.path.exists(err_fpath):
            os.remove(err_fpath)
        return ref_fpaths

    # Width of the name column in the progress lines below.
    max_organism_name_len = max(len(organism) for organism in organisms)

    logger.print_timestamp()
    logger.main_info('Trying to download found references from NCBI. '
                'Totally ' + str(total_scored_left) + ' organisms to try.')
    if len(downloaded_ref_fpaths) > 0:
        logger.main_info('MetaQUAST will attempt to use previously downloaded references...')

    for organism in organisms:
        ref_fpath = os.path.join(downloaded_dirpath, correct_name(organism) + '.fasta')
        spaces = (max_organism_name_len - len(organism)) * ' '
        new_ref_fpath = None
        was_downloaded = False
        if os.path.exists(ref_fpath):
            was_downloaded = True
            new_ref_fpath = ref_fpath
        elif organism not in not_founded_organisms:
            new_ref_fpath = download_refs(organism, ref_fpath)
        # The organism has been handled whatever the outcome.
        total_scored_left -= 1
        if new_ref_fpath:
            total_downloaded += 1
            if was_downloaded:
                logger.main_info("  %s%s | was downloaded previously (total %d, %d more to go)" %
                            (organism.replace('+', ' '), spaces, total_downloaded, total_scored_left))
                if new_ref_fpath not in ref_fpaths:
                    ref_fpaths.append(new_ref_fpath)
            else:
                logger.main_info("  %s%s | successfully downloaded (total %d, %d more to go)" %
                        (organism.replace('+', ' '), spaces, total_downloaded, total_scored_left))
                ref_fpaths.append(new_ref_fpath)
            downloaded_organisms.append(organism)
        else:
            logger.main_info("  %s%s | not found in the NCBI database" % (organism.replace('+', ' '), spaces))
            not_founded_organisms.add(organism)

    for assembly, label in zip(assemblies, labels):
        check_fpath = get_blast_output_fpath(blast_check_fpath, label)
        if os.path.exists(check_fpath):
            with open(check_fpath) as check_file:
                # Keep only the first line. partition() also copes with a
                # file lacking a newline, where slicing up to find() == -1
                # would silently drop the last character.
                text = check_file.read().partition('\n')[0]
        else:
            text = 'Assembly: %s md5 checksum: %s\n' % (assembly.fpath, md5(assembly.fpath))

        if organisms_assemblies:
            label_organisms = organisms_assemblies[label]
            cur_downloaded_organisms = [organism for organism in downloaded_organisms if organism in label_organisms]
            cur_not_founded_organisms = [organism for organism in not_founded_organisms if organism in label_organisms]
        else:
            cur_downloaded_organisms = list(downloaded_organisms)
            cur_not_founded_organisms = list(not_founded_organisms)

        with open(check_fpath, 'w') as check_file:
            check_file.write(text)
            check_file.write('\n---\n')
            check_file.write('Downloaded: %s\n' % ','.join(cur_downloaded_organisms))
            check_file.write('Not_founded: %s\n' % ','.join(cur_not_founded_organisms))
    return ref_fpaths
def process_blast(blast_assemblies, downloaded_dirpath, corrected_dirpath, labels, blast_check_fpath, err_fpath):
    """Run BlastN for the assemblies and pick candidate reference organisms.

    After making sure the BLAST binaries and a database are available
    (``qconfig.custom_blast_db_fpath`` or the downloaded default), BlastN is
    run in parallel for ``blast_assemblies``. The result file of each label
    is then scanned; for every query (queries are separated by ``#`` comment
    lines) hits are examined until one is accepted, i.e. passes the
    identity/length/bit-score thresholds from ``qconfig``.

    Returns:
        ``(scores_organisms, organisms_assemblies)`` where
        ``scores_organisms`` is a list of ``(score, seqname)`` and
        ``organisms_assemblies`` maps each label to its selected seqnames;
        ``(None, None)`` when the binaries or the database are unavailable or
        when nothing was selected.
    """
    if not download_all_blast_binaries():
        return None, None

    if qconfig.custom_blast_db_fpath:
        global db_fpath
        db_fpath = qconfig.custom_blast_db_fpath
        if isdir(db_fpath):
            db_aux_files = [f for f in os.listdir(db_fpath) if f.endswith('.nsq')]
            if db_aux_files:
                db_fpath = join(qconfig.custom_blast_db_fpath, db_aux_files[0].replace('.nsq', ''))
        elif isfile(db_fpath) and db_fpath.endswith('.nsq'):
            db_fpath = db_fpath[:-len('.nsq')]
        if not os.path.isfile(db_fpath + '.nsq'):
            logger.error('You should specify path to BLAST database obtained by running makeblastdb command: '
                         'either path to directory containing <dbname>.nsq file or path to <dbname>.nsq file itself.'
                         ' Also you can rerun MetaQUAST without --blast-db option. MetaQUAST uses SILVA 16S RNA database by default.',
                         exit_with_code=2)

    elif not download_blastdb():
        return None, None

    blast_res_fpath = os.path.join(downloaded_dirpath, 'blast.res')

    if len(blast_assemblies) > 0:
        logger.main_info('Running BlastN..')
        n_jobs = min(qconfig.max_threads, len(blast_assemblies))
        blast_threads = max(1, qconfig.max_threads // n_jobs)
        if is_python2():
            from joblib import Parallel, delayed
        else:
            from joblib3 import Parallel, delayed
        Parallel(n_jobs=n_jobs)(delayed(parallel_blast)(assembly.fpath, assembly.label, corrected_dirpath,
                                                        err_fpath, blast_res_fpath, blast_check_fpath, blast_threads)
                                for i, assembly in enumerate(blast_assemblies))

    logger.main_info('')
    scores_organisms = []
    organisms_assemblies = {}
    # Hard-coded column layout of the tabular output (see TODO below).
    subj_id_col, idy_col, len_col, score_col = 1, 2, 3, 11
    for label in labels:
        all_scores = []
        organisms = []
        res_fpath = get_blast_output_fpath(blast_res_fpath, label)
        if os.path.exists(res_fpath):
            refs_for_query = 0
            with open(res_fpath) as res_file:
                for line in res_file:
                    if line.startswith('#'):
                        # A comment line resets the per-query hit counter.
                        refs_for_query = 0
                        continue
                    fs = line.split()
                    # The bit score is read from index 11, so at least 12
                    # fields are needed (the old `> 10` check let an 11-field
                    # line through and raised IndexError).
                    if refs_for_query != 0 or len(fs) <= score_col:
                        continue
                    # TODO: find and parse "Fields" line to detect each column indexes:
                    # Fields: query id, subject id, % identity, alignment length, mismatches, gap opens, q. start, q. end, s. start, s. end, evalue, bit score
                    # We need: identity, length, score, query and subject id.
                    organism_id = fs[subj_id_col]
                    idy = float(fs[idy_col])
                    length = int(fs[len_col])
                    score = float(fs[score_col])
                    if not (idy >= qconfig.identity_threshold and length >= qconfig.min_length
                            and score >= qconfig.min_bitscore):
                        continue
                    seqname, taxons = parse_organism_id(organism_id)
                    if not seqname:
                        continue
                    name_parts = seqname.split('_')
                    if len(name_parts) <= 1 or 'uncultured' in seqname:
                        continue
                    specie = name_parts[0] + '_' + name_parts[1]
                    if specie not in organisms:
                        all_scores.append((score, seqname))
                        if taxons:
                            taxons_for_krona[correct_name(seqname)] = taxons
                        organisms.append(specie)
                        refs_for_query += 1
                    else:
                        # Species already present: keep whichever hit has the
                        # higher score.
                        tuple_scores = [entry for entry in all_scores if specie in entry[1]]
                        if tuple_scores and score > tuple_scores[0][0]:
                            all_scores.remove(tuple_scores[0])
                            all_scores.append((score, seqname))
                            if taxons:
                                taxons_for_krona[correct_name(seqname)] = taxons
                            refs_for_query += 1
        # Tuples start with the score, so this ranks best hits first.
        all_scores = sorted(all_scores, reverse=True)
        all_scores = all_scores[:qconfig.max_references]
        for score in all_scores:
            # Skip seqnames already selected for a previously processed label.
            if not any(score[1] in selected for selected in organisms_assemblies.values()):
                scores_organisms.append(score)
        organisms_assemblies[label] = [score[1] for score in all_scores]
    if not scores_organisms:
        return None, None
    return scores_organisms, organisms_assemblies
Ejemplo n.º 17
0
def get_corr_name(name):
    """Return ``name`` after passing it through ``qutils.correct_name``."""
    corrected = qutils.correct_name(name)
    return corrected
Ejemplo n.º 18
0
def process_blast(blast_assemblies, downloaded_dirpath, corrected_dirpath,
                  labels, blast_check_fpath, err_fpath):
    """Run BlastN on the assemblies and collect the best-scoring reference hits.

    Steps:
      1. Make sure the BLAST binaries and a BLAST database are available
         (a custom one from ``qconfig.custom_blast_db_fpath`` or the one
         fetched by ``download_blastdb``).
      2. Run ``parallel_blast`` for every item of ``blast_assemblies``.
      3. Parse the per-label BLAST result files and pick, per species, the
         hit with the highest bit score.

    Side effects: may rebind the module-level ``db_fpath`` and adds entries
    to the module-level ``taxons_for_krona`` mapping.

    :param blast_assemblies: assemblies to run BlastN on; each must expose
        ``fpath`` and ``label``.
    :param downloaded_dirpath: directory where ``blast.res`` files are kept.
    :param corrected_dirpath: forwarded to ``parallel_blast``.
    :param labels: labels whose BLAST result files are parsed.
    :param blast_check_fpath: forwarded to ``parallel_blast``.
    :param err_fpath: forwarded to ``parallel_blast``.
    :return: ``(species_scores, species_by_assembly, replacement_dict)`` where
        ``species_scores`` is a list of ``(seqname, query_id, score)`` tuples
        not already reported for a previously processed label,
        ``species_by_assembly`` maps a label to its selected sequence names
        and ``replacement_dict`` maps a query id to alternative sequence
        names; ``(None, None, None)`` when the prerequisites could not be
        obtained or no hit was selected.
    """
    global db_fpath

    if not download_blast_binaries(filenames=blast_filenames):
        return None, None, None

    if qconfig.custom_blast_db_fpath:
        db_fpath = qconfig.custom_blast_db_fpath
        if isdir(db_fpath):
            # A directory was given: use the first database found inside it.
            db_aux_files = [
                f for f in os.listdir(db_fpath) if f.endswith('.nsq')
            ]
            if db_aux_files:
                db_fpath = join(qconfig.custom_blast_db_fpath,
                                db_aux_files[0].replace('.nsq', ''))
        elif isfile(db_fpath) and db_fpath.endswith('.nsq'):
            # The .nsq file itself was given: strip the extension.
            db_fpath = db_fpath[:-len('.nsq')]
        if not os.path.isfile(db_fpath + '.nsq'):
            logger.error(
                'You should specify path to BLAST database obtained by running makeblastdb command: '
                'either path to directory containing <dbname>.nsq file or path to <dbname>.nsq file itself.'
                ' Also you can rerun MetaQUAST without --blast-db option. MetaQUAST uses SILVA 16S RNA database by default.',
                exit_with_code=2)

    elif not download_blastdb():
        return None, None, None

    blast_res_fpath = os.path.join(downloaded_dirpath, 'blast.res')

    if len(blast_assemblies) > 0:
        logger.main_info('Running BlastN..')
        n_jobs = min(qconfig.max_threads, len(blast_assemblies))
        blast_threads = max(1, qconfig.max_threads // n_jobs)
        if is_python2():
            from joblib2 import Parallel, delayed
        else:
            from joblib3 import Parallel, delayed
        Parallel(n_jobs=n_jobs)(delayed(parallel_blast)(
            assembly.fpath, assembly.label, corrected_dirpath, err_fpath,
            blast_res_fpath, blast_check_fpath, blast_threads)
                                for assembly in blast_assemblies)

    def _column(fields, name, default):
        # Index of a named column in a "Fields:" header, or the default.
        return fields.index(name) if name in fields else default

    logger.main_info()
    species_scores = []
    species_by_assembly = dict()
    # Processing of a query block stops once this many hits were counted.
    max_entries = 4
    replacement_dict = defaultdict(list)
    for label in labels:
        assembly_scores = []
        assembly_species = []
        res_fpath = get_blast_output_fpath(blast_res_fpath, label)
        if os.path.exists(res_fpath):
            refs_for_query = 0
            with open(res_fpath) as res_file:
                # Start from the standard tabular layout so that a result
                # file lacking a "Fields:" header is still parsed instead of
                # failing on a comparison with None.
                query_id_col, subj_id_col, idy_col, len_col, score_col = 0, 1, 2, 3, 11
                last_col = score_col
                for line in res_file:
                    if line.startswith('#'):
                        # Every comment line starts a fresh per-query count.
                        refs_for_query = 0
                        # Fields: query id, subject id, % identity, alignment length, mismatches, gap opens, q. start, q. end, s. start, s. end, evalue, bit score
                        if 'Fields' in line:
                            header = line.strip().split('Fields: ')[-1].split(', ')
                            query_id_col = _column(header, 'query id', 0)
                            subj_id_col = _column(header, 'subject id', 1)
                            idy_col = _column(header, '% identity', 2)
                            len_col = _column(header, 'alignment length', 3)
                            score_col = _column(header, 'bit score', 11)
                            last_col = max(query_id_col, subj_id_col, idy_col,
                                           len_col, score_col)
                        continue

                    fs = line.split()
                    if refs_for_query >= max_entries or len(fs) <= last_col:
                        continue

                    query_id = fs[query_id_col]
                    organism_id = fs[subj_id_col]
                    idy = float(fs[idy_col])
                    length = int(fs[len_col])
                    score = float(fs[score_col])
                    if not (idy >= qconfig.identity_threshold and
                            length >= qconfig.min_length and
                            score >= qconfig.min_bitscore):
                        continue

                    seqname, taxons = parse_organism_id(organism_id)
                    if not seqname:
                        continue
                    species_name = get_species_name(seqname)
                    if not species_name or 'uncultured' in seqname:
                        continue

                    if refs_for_query > 0:
                        # Not the first accepted hit of this query: remember
                        # it only as an alternative for the query.
                        if seqname not in replacement_dict[query_id]:
                            replacement_dict[query_id].append(seqname)
                            refs_for_query += 1
                        continue

                    if species_name not in assembly_species:
                        assembly_scores.append((seqname, query_id, score))
                        if taxons:
                            taxons_for_krona[correct_name(seqname)] = taxons
                        assembly_species.append(species_name)
                        refs_for_query += 1
                    else:
                        # Species already seen: keep whichever hit scores higher.
                        seq_scores = [
                            entry for entry in assembly_scores
                            if get_species_name(entry[0]) == species_name
                        ]
                        if seq_scores and score > seq_scores[0][2]:
                            assembly_scores.remove(seq_scores[0])
                            assembly_scores.append((seqname, query_id, score))
                            if taxons:
                                taxons_for_krona[correct_name(seqname)] = taxons
                            refs_for_query += 1

        # Rank by bit score (ties broken by name) before truncating. The
        # tuples start with the sequence name, so a plain tuple sort would
        # keep the alphabetically last names instead of the best hits.
        assembly_scores = sorted(assembly_scores,
                                 key=lambda entry: (entry[2], entry[0]),
                                 reverse=True)
        assembly_scores = assembly_scores[:qconfig.max_references]
        for seqname, query_id, score in assembly_scores:
            # Report a sequence only once across all labels.
            if not any(seqname in species_list
                       for species_list in species_by_assembly.values()):
                species_scores.append((seqname, query_id, score))
        species_by_assembly[label] = [
            seqname for seqname, query_id, score in assembly_scores
        ]
    if not species_scores:
        return None, None, None
    return species_scores, species_by_assembly, replacement_dict
Ejemplo n.º 19
0
def process_blast(blast_assemblies, downloaded_dirpath, corrected_dirpath, labels, blast_check_fpath, err_fpath):
    """Run BlastN on the assemblies and collect the best-scoring reference hits.

    Ensures the BLAST database directory, binaries and database exist, runs
    ``parallel_blast`` for every item of ``blast_assemblies`` and then parses
    the per-label BLAST result files.

    Side effects: may create ``blastdb_dirpath``, may rebind the module-level
    ``db_fpath`` and adds entries to the module-level ``taxons_for_krona``.

    :param blast_assemblies: assemblies to run BlastN on; each must expose
        ``fpath`` and ``label``.
    :param downloaded_dirpath: directory where ``blast.res`` files are kept.
    :param corrected_dirpath: forwarded to ``parallel_blast``.
    :param labels: labels whose BLAST result files are parsed.
    :param blast_check_fpath: forwarded to ``parallel_blast``.
    :param err_fpath: forwarded to ``parallel_blast``.
    :return: ``(scores_organisms, organisms_assemblies)`` where
        ``scores_organisms`` is a list of ``(score, seqname)`` tuples not
        already reported for a previously processed label and
        ``organisms_assemblies`` maps a label to its selected sequence names;
        ``(None, None)`` when prerequisites are missing or nothing was selected.
    """
    global db_fpath

    if not os.path.isdir(blastdb_dirpath):
        os.makedirs(blastdb_dirpath)

    if not download_all_blast_binaries():
        return None, None

    if qconfig.custom_blast_db_fpath:
        db_fpath = qconfig.custom_blast_db_fpath
        if isdir(db_fpath):
            # A directory was given: use the first database found inside it.
            db_aux_files = [f for f in os.listdir(db_fpath) if f.endswith('.nsq')]
            if db_aux_files:
                db_fpath = join(qconfig.custom_blast_db_fpath, db_aux_files[0].replace('.nsq', ''))
        elif isfile(db_fpath) and db_fpath.endswith('.nsq'):
            # The .nsq file itself was given: strip the extension.
            db_fpath = db_fpath[:-len('.nsq')]
        if not os.path.isfile(db_fpath + '.nsq'):
            logger.error('You should specify path to BLAST database obtained by running makeblastdb command: '
                         'either path to directory containing <dbname>.nsq file or path to <dbname>.nsq file itself.'
                         ' Also you can rerun MetaQUAST without --blast-db option. MetaQUAST uses SILVA 16S RNA database by default.',
                         exit_with_code=2)

    elif not os.path.isfile(db_fpath + '.nsq') or os.path.getsize(db_fpath + '.nsq') < db_nsq_fsize:
        # The default database is missing or smaller than expected: fetch it.
        if not download_blastdb():
            return None, None
        logger.info()

    blast_res_fpath = os.path.join(downloaded_dirpath, 'blast.res')

    if len(blast_assemblies) > 0:
        logger.main_info('Running BlastN..')
        n_jobs = min(qconfig.max_threads, len(blast_assemblies))
        blast_threads = max(1, qconfig.max_threads // n_jobs)
        if is_python2():
            from joblib import Parallel, delayed
        else:
            from joblib3 import Parallel, delayed
        Parallel(n_jobs=n_jobs)(delayed(parallel_blast)(assembly.fpath, assembly.label, corrected_dirpath,
                                                        err_fpath, blast_res_fpath, blast_check_fpath, blast_threads)
                                for assembly in blast_assemblies)

    logger.main_info('')
    scores_organisms = []
    organisms_assemblies = {}
    for label in labels:
        all_scores = []
        organisms = []
        res_fpath = get_blast_output_fpath(blast_res_fpath, label)
        if os.path.exists(res_fpath):
            refs_for_query = 0
            # Context manager so the result file is closed deterministically.
            with open(res_fpath) as res_file:
                for line in res_file:
                    if line.startswith('#'):
                        # A comment line starts a new query block.
                        refs_for_query = 0
                        continue
                    if refs_for_query != 0:
                        # Only the first accepted hit of a query is used.
                        continue
                    # TODO: find and parse "Fields" line to detect each column indexes:
                    # Fields: query id, subject id, % identity, alignment length, mismatches, gap opens, q. start, q. end, s. start, s. end, evalue, bit score
                    # We need: identity, length, score, query and subject id.
                    fields = line.split()
                    # Column 11 (bit score) is read below, so 12 fields are required.
                    if len(fields) < 12:
                        continue
                    organism_id = fields[1]
                    idy = float(fields[2])
                    length = int(fields[3])
                    score = float(fields[11])
                    if not (idy >= qconfig.identity_threshold and length >= qconfig.min_length
                            and score >= qconfig.min_bitscore):
                        continue
                    seqname, taxons = parse_organism_id(organism_id)
                    if not seqname:
                        continue
                    name_parts = seqname.split('_')
                    if len(name_parts) <= 1 or 'uncultured' in seqname:
                        continue
                    # The species key is the first two '_'-separated tokens.
                    specie = name_parts[0] + '_' + name_parts[1]
                    if specie not in organisms:
                        all_scores.append((score, seqname))
                        if taxons:
                            taxons_for_krona[correct_name(seqname)] = taxons
                        organisms.append(specie)
                        refs_for_query += 1
                    else:
                        # Species already seen: keep whichever hit scores higher.
                        tuple_scores = [x for x in all_scores if specie in x[1]]
                        if tuple_scores and score > tuple_scores[0][0]:
                            all_scores.remove(tuple_scores[0])
                            all_scores.append((score, seqname))
                            if taxons:
                                taxons_for_krona[correct_name(seqname)] = taxons
                            refs_for_query += 1
        # Tuples start with the score, so this keeps the best-scoring hits.
        all_scores = sorted(all_scores, reverse=True)
        all_scores = all_scores[:qconfig.max_references]
        for score in all_scores:
            # Report a sequence only once across all labels.
            if not any(score[1] in seqnames for seqnames in organisms_assemblies.values()):
                scores_organisms.append(score)
        organisms_assemblies[label] = [score[1] for score in all_scores]
    if not scores_organisms:
        return None, None
    return scores_organisms, organisms_assemblies
Ejemplo n.º 20
0
def parse_ncbi(ncbi_file):
    """Parse gene records from an open NCBI-format text file.

    A record starts at a line matching the module-level
    ``ncbi_start_pattern`` (which provides the ``number`` and ``name``
    groups) and runs until the next such line or end of file. Within a
    record the ``Chromosome:``, ``Annotation:`` and ``ID:`` lines are read.

    :param ncbi_file: file-like object supporting ``readline()``.
    :return: list of ``Gene`` objects for the records whose ``start`` and
        ``end`` are both set (presumably ``Gene`` defaults them to ``None`` --
        verify against the class definition).
    """
    annotation_pattern = re.compile(
        r'Annotation: (?P<seqname>.+) \((?P<start>\d+)\.\.(?P<end>\d+)(, complement)?\)',
        re.I)
    chromosome_pattern = re.compile(r'Chromosome: (?P<chromosome>\S+);', re.I)
    id_pattern = re.compile(r'ID: (?P<id>\d+)', re.I)

    genes = []

    line = ncbi_file.readline()
    while line != '':
        # Skip blank lines and '##' comment lines.
        while line != '' and (line.rstrip() == '' or line.startswith('##')):
            line = ncbi_file.readline()

        # Advance to the next record header. The file position must move on
        # every iteration, otherwise a non-matching line (or EOF reached
        # after trailing blank lines) would spin forever.
        m = ncbi_start_pattern.match(line.rstrip())
        while not m and line != '':
            line = ncbi_file.readline()
            m = ncbi_start_pattern.match(line.rstrip())
        if not m:
            # End of file reached without another record.
            break

        gene = Gene(number=int(m.group('number')),
                    name=qutils.correct_name(m.group('name')))

        # Collect the body of the record: everything up to the next header.
        the_rest_lines = []
        line = ncbi_file.readline()
        while line != '' and not ncbi_start_pattern.match(line.rstrip()):
            the_rest_lines.append(line.rstrip())
            line = ncbi_file.readline()

        for info_line in the_rest_lines:
            if info_line.startswith('Chromosome:'):
                m = chromosome_pattern.match(info_line)
                if m:
                    gene.chromosome = m.group('chromosome')

            if info_line.startswith('Annotation:'):
                m = annotation_pattern.match(info_line)
                if m:
                    gene.seqname = m.group('seqname')
                    gene.start = int(m.group('start'))
                    gene.end = int(m.group('end'))

                    to_trim = 'Chromosome' + ' ' + str(gene.chromosome)
                    if gene.chromosome and gene.seqname.startswith(to_trim):
                        # str.lstrip returns a new string, so the result has
                        # to be assigned back for the trim to take effect.
                        gene.seqname = gene.seqname[len(to_trim):].lstrip(' ,')

                else:
                    logger.warning('Wrong NCBI annotation for gene ' +
                                   str(gene.number) + '. ' + gene.name +
                                   '. Skipping this gene.')

            if info_line.startswith('ID:'):
                m = id_pattern.match(info_line)
                if m:
                    gene.id = m.group('id')
                else:
                    logger.warning(
                        'Can\'t parse gene\'s ID in NCBI format. Gene is ' +
                        str(gene.number) + '. ' + gene.name + '. Skipping it.')

        # Records without coordinates are dropped silently rather than
        # raising a parse error.
        if gene.start is not None and gene.end is not None:
            genes.append(gene)
    return genes
Ejemplo n.º 21
0
def process_refs(organisms, assemblies, labels, downloaded_dirpath, not_founded_organisms, downloaded_ref_fpaths,
                 blast_check_fpath, err_fpath, organisms_assemblies=None):
    """Obtain a reference FASTA for every organism and update the check files.

    For each organism an existing ``<corrected name>.fasta`` inside
    ``downloaded_dirpath`` is reused; otherwise, unless the organism is
    already listed in ``not_founded_organisms``, ``download_refs`` is called.
    Organisms that yield no file are added to ``not_founded_organisms``.
    Afterwards the per-label check file is rewritten with its first line
    (or a fresh ``Assembly: ... size: ...`` line) followed by the lists of
    downloaded and not found organisms.

    :param organisms: sized iterable of organism names to process.
    :param assemblies: assemblies paired with ``labels``; ``fpath`` is read.
    :param labels: labels used to locate the check files.
    :param downloaded_dirpath: directory holding the reference FASTA files.
    :param not_founded_organisms: set of organisms known to be missing;
        updated in place.
    :param downloaded_ref_fpaths: previously downloaded reference paths; only
        its length is inspected, to decide whether to log a notice.
    :param blast_check_fpath: base path of the check files.
    :param err_fpath: error log, removed when there is nothing to do and
        ``qconfig.debug`` is off.
    :param organisms_assemblies: optional mapping label -> organisms used to
        restrict what is written for each label.
    :return: list of reference file paths that are available.
    """
    ref_fpaths = []
    downloaded_organisms = []

    total_downloaded = 0
    total_scored_left = len(organisms)
    if total_scored_left == 0:
        if not qconfig.debug and os.path.exists(err_fpath):
            os.remove(err_fpath)
        return ref_fpaths

    # Width used to align the status column of the log lines.
    max_organism_name_len = max(len(name) for name in organisms)

    logger.print_timestamp()
    logger.main_info('Trying to download found references from NCBI. '
                'Totally ' + str(total_scored_left) + ' organisms to try.')
    if len(downloaded_ref_fpaths) > 0:
        logger.main_info('MetaQUAST will attempt to use previously downloaded references...')

    for organism in organisms:
        ref_fpath = os.path.join(downloaded_dirpath, correct_name(organism) + '.fasta')
        spaces = ' ' * (max_organism_name_len - len(organism))
        shown_name = organism.replace('+', ' ')

        was_downloaded = os.path.exists(ref_fpath)
        if was_downloaded:
            new_ref_fpath = ref_fpath
        elif organism not in not_founded_organisms:
            new_ref_fpath = download_refs(organism, ref_fpath)
        else:
            new_ref_fpath = None

        # One organism handled, whatever the outcome.
        total_scored_left -= 1

        if not new_ref_fpath:
            logger.main_info("  %s%s | not found in the NCBI database" % (shown_name, spaces))
            not_founded_organisms.add(organism)
            continue

        total_downloaded += 1
        if was_downloaded:
            logger.main_info("  %s%s | was downloaded previously (total %d, %d more to go)" %
                        (shown_name, spaces, total_downloaded, total_scored_left))
            if new_ref_fpath not in ref_fpaths:
                ref_fpaths.append(new_ref_fpath)
        else:
            logger.main_info("  %s%s | successfully downloaded (total %d, %d more to go)" %
                    (shown_name, spaces, total_downloaded, total_scored_left))
            ref_fpaths.append(new_ref_fpath)
        downloaded_organisms.append(organism)

    for assembly, label in zip(assemblies, labels):
        check_fpath = get_blast_output_fpath(blast_check_fpath, label)
        if os.path.exists(check_fpath):
            # Keep only the first line of the existing check file.
            with open(check_fpath) as check_file:
                text = check_file.read()
            text = text[:text.find('\n')]
        else:
            text = 'Assembly: %s size: %d\n' % (assembly.fpath, os.path.getsize(assembly.fpath))
        with open(check_fpath, 'w') as check_file:
            check_file.write(text)
            check_file.write('\n---\n')
            if organisms_assemblies:
                # Restrict both lists to the organisms recorded for this label.
                cur_downloaded_organisms = [name for name in downloaded_organisms
                                            if name in organisms_assemblies[label]]
                cur_not_founded_organisms = [name for name in not_founded_organisms
                                             if name in organisms_assemblies[label]]
            else:
                cur_downloaded_organisms = list(downloaded_organisms)
                cur_not_founded_organisms = list(not_founded_organisms)
            check_file.write('Downloaded: %s\n' % ','.join(cur_downloaded_organisms))
            check_file.write('Not_founded: %s\n' % ','.join(cur_not_founded_organisms))
    return ref_fpaths
Ejemplo n.º 22
0
def get_correct_names_for_chroms(output_dirpath,
                                 fasta_fpath,
                                 sam_fpath,
                                 err_path,
                                 reads_fpaths,
                                 logger,
                                 is_reference=False):
    """Map chromosome names of a SAM file to those of a FASTA file.

    The SAM header is extracted with ``sambamba view -H`` into
    ``<basename(sam_fpath)>.header`` next to ``output_dirpath`` (an existing
    header file is reused when the SAM file itself is absent). Its ``@SQ``
    records are compared position by position with the sequences reported
    by ``get_chr_lengths_from_fastafile``.

    :param output_dirpath: its parent directory receives the header file.
    :param fasta_fpath: FASTA file the SAM file should correspond to.
    :param sam_fpath: SAM file to inspect.
    :param err_path: file that the subprocess stderr is appended to.
    :param reads_fpaths: when truthy a mismatch is logged as a warning,
        otherwise as an error.
    :param logger: logger used for messages and passed to the subprocess call.
    :param is_reference: selects the wording of the mismatch message.
    :return: dict mapping SAM chromosome name -> FASTA chromosome name, or
        ``None`` when neither the SAM nor the header file exists or when the
        number, lengths or names of the chromosomes do not match.
    """
    correct_chr_names = dict()
    fasta_chr_lengths = get_chr_lengths_from_fastafile(fasta_fpath)
    sam_chr_lengths = OrderedDict()
    sam_header_fpath = join(dirname(output_dirpath),
                            basename(sam_fpath) + '.header')
    if not isfile(sam_fpath) and not isfile(sam_header_fpath):
        return None
    if isfile(sam_fpath):
        # Context managers close both handles once the subprocess call
        # returns instead of leaving them to the garbage collector.
        with open(sam_header_fpath, 'w') as header_out, \
                open(err_path, 'a') as err_out:
            qutils.call_subprocess(
                [sambamba_fpath('sambamba'), 'view', '-H', '-S', sam_fpath],
                stdout=header_out,
                stderr=err_out,
                logger=logger)
    # Raw strings avoid invalid-escape warnings for \S and \d.
    chr_name_pattern = r'SN:(\S+)'
    chr_len_pattern = r'LN:(\d+)'

    with open(sam_header_fpath) as sam_in:
        for l in sam_in:
            if l.startswith('@SQ'):
                chr_name = re.findall(chr_name_pattern, l)[0]
                chr_len = re.findall(chr_len_pattern, l)[0]
                sam_chr_lengths[chr_name] = int(chr_len)

    inconsistency = ''
    if len(fasta_chr_lengths) != len(sam_chr_lengths):
        inconsistency = 'Number of chromosomes'
    else:
        # Sequences are compared pairwise in the iteration order of both mappings.
        for fasta_chr, sam_chr in zip(fasta_chr_lengths.keys(),
                                      sam_chr_lengths.keys()):
            same_length = sam_chr_lengths[sam_chr] == fasta_chr_lengths[fasta_chr]
            # The corrected SAM name only has to match a prefix of the FASTA name.
            same_name = correct_name(sam_chr) == fasta_chr[:len(sam_chr)]
            if same_name and same_length:
                correct_chr_names[sam_chr] = fasta_chr
            elif not same_length:
                inconsistency = 'Chromosome lengths'
                break
            else:
                inconsistency = 'Chromosome names'
                break
    if inconsistency:
        if reads_fpaths:
            logger.warning(
                inconsistency + ' in ' + fasta_fpath +
                ' and corresponding SAM file ' + sam_fpath +
                ' do not match. ' + 'QUAST will try to realign reads to ' +
                ('the reference genome' if is_reference else fasta_fpath))
        else:
            logger.error(
                inconsistency + ' in ' + fasta_fpath +
                ' and corresponding SAM file ' + sam_fpath +
                ' do not match. ' +
                'Use SAM file obtained by aligning reads to ' +
                ('the reference genome' if is_reference else fasta_fpath))
        return None
    return correct_chr_names
Ejemplo n.º 23
0
def correct_meta_references(ref_fpaths, corrected_dirpath):
    """Write per-reference and combined corrected FASTA files.

    Every sequence of every file in ``ref_fpaths`` gets a corrected, unique
    name prefixed with the name of its corrected reference file. It is
    appended both to that reference's file inside ``corrected_dirpath`` and
    to the combined reference file. ``contigs_analyzer.ref_labels_by_chromosomes``
    is updated with the new sequence names.

    :param ref_fpaths: paths of the reference FASTA files.
    :param corrected_dirpath: directory receiving the corrected files.
    :return: ``(corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs,
        ref_fpaths)`` where ``chromosomes_by_refs`` maps a reference name to a
        list of ``(sequence name, sequence length)`` tuples.
    """
    corrected_ref_fpaths = []
    chromosomes_by_refs = {}
    combined_ref_fpath = os.path.join(corrected_dirpath, qconfig.combined_ref_name)

    def _proceed_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references, ref_fpath):
        # Append one sequence to its reference file and to the combined file.
        seq_fname = ref_name + ref_fasta_ext

        if total_references > 1:
            # Later sequences go to the file created for the first one.
            target_fpath = corrected_ref_fpaths[-1]
        else:
            target_fpath = qutils.unique_corrected_fpath(os.path.join(corrected_dirpath, seq_fname))
            corrected_ref_fpaths.append(target_fpath)
        full_seq_name = qutils.name_from_fpath(target_fpath) + '_' + seq_name

        # NOTE(review): the corrected sequence is only used as a validity
        # check here; the original ``seq`` is what gets written -- confirm
        # this is intended.
        if not qconfig.no_check and not correct_seq(seq, ref_fpath):
            return None, None

        record = [(full_seq_name, seq)]
        fastaparser.write_fasta(target_fpath, record, 'a')
        fastaparser.write_fasta(combined_ref_fpath, record, 'a')

        contigs_analyzer.ref_labels_by_chromosomes[full_seq_name] = qutils.name_from_fpath(target_fpath)
        chromosomes_by_refs[ref_name].append((full_seq_name, len(seq)))

        return full_seq_name, target_fpath

    # Reference names shared by more than one input file need disambiguation.
    name_counts = defaultdict(int)
    for path in ref_fpaths:
        base_name, _ = qutils.splitext_for_fasta_file(os.path.basename(path))
        name_counts[base_name] += 1

    for ref_fpath in ref_fpaths:
        ref_name, ref_fasta_ext = qutils.splitext_for_fasta_file(os.path.basename(ref_fpath))
        if name_counts[ref_name] > 1:
            ref_name = qutils.get_label_from_par_dir_and_fname(ref_fpath)

        chromosomes_by_refs[ref_name] = []
        used_seq_names = defaultdict(int)

        corr_seq_fpath = None
        # The 1-based position doubles as the running count of sequences.
        for seq_count, (seq_name, seq) in enumerate(fastaparser.read_fasta(ref_fpath), 1):
            seq_name = correct_name(seq_name, qutils.MAX_CONTIG_NAME - len(ref_name) - 1)
            uniq_seq_name = get_uniq_name(seq_name, used_seq_names)
            used_seq_names[seq_name] += 1
            corr_seq_name, corr_seq_fpath = _proceed_seq(
                uniq_seq_name, seq, ref_name, ref_fasta_ext, seq_count, ref_fpath)
            if not corr_seq_name:
                break
        if corr_seq_fpath:
            logger.main_info('  ' + ref_fpath + ' ==> ' + qutils.name_from_fpath(corr_seq_fpath))

    logger.main_info('  All references combined in ' + qconfig.combined_ref_name)

    return corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_fpaths