Exemple #1
0
def find_recombination(genes, alignment, output=sys.stdout):
    """Write a per-gene mutation report computed from an alignment.

    The alignment is read line by line. A line starting with ``>`` opens a
    new sample; every following line is attributed, in order, to the
    corresponding entry of ``genes``. For each gene the sequences gathered
    over all samples are handed to ``utils.compar_seqs`` and one
    tab-separated row is written: gene, value returned by ``compar_seqs``,
    length of the first sequence, and that value per 100 bases.

    :param genes: Iterable of gene names, one per line (output of
                  :class:`~pymlst.wg.extractors.TableExtractor`
                  using ``export='gene'``).
    :param alignment: `fasta`_ file alignment
                      (output of :class:`~pymlst.wg.extractors.SequenceExtractor`
                      using ``align=True``).
    :param output: The output where to write the results.
    :raises PyMLSTError: When a sample holds more lines than there are
                         genes, or when the sequences of a gene do not all
                         have the same length.
    """
    gene_names = [entry.rstrip("\n") for entry in genes]
    logging.info("Number of genes to look at : %s", len(gene_names))

    per_gene = [[] for _ in gene_names]
    samples = []  # sample names are collected but not used further here

    # Distribute the alignment lines over the genes, sample by sample.
    position = 0
    for raw in alignment:
        record = raw.rstrip("\n")

        if record.startswith(">"):
            # A header restarts the gene counter for the new sample.
            samples.append(record.lstrip(">"))
            position = 0
            continue

        if position >= len(gene_names):
            raise exceptions.PyMLSTError(
                "The genes list doesn't correspond to the alignment {}".format(
                    position))

        per_gene[position].append(record)
        position += 1

    # Every gene must have sequences of one single length.
    for name, column in zip(gene_names, per_gene):
        lengths = {len(seq) for seq in column}
        if len(lengths) > 1:
            logging.error(lengths)
            raise exceptions.PyMLSTError(
                'The following genes are not aligned: {}'.format(name))

    output.write("Gene\tMutation\tLenght\tmutation per 100 base\n")
    for name, column in zip(gene_names, per_gene):
        mutations = utils.compar_seqs(column)
        length = len(column[0])
        fields = [name, str(mutations), str(length),
                  str(mutations / length * 100)]
        output.write("\t".join(fields) + "\n")
Exemple #2
0
    def extract(self, base, output):
        """Write the concatenated per-strain sequences of the selected genes.

        Genes are read with ``read_gene_list``; genes reported as duplicated
        by ``base`` are skipped. For the others, sequences are used as stored
        when they all share one length and ``self.realign`` is ``False``;
        otherwise they are aligned with ``mafft.align`` first. The result is
        written in fasta layout: a ``>strain`` header followed by one line
        per retained gene.

        :param base: Database object providing ``get_all_strains``,
                     ``get_duplicated_genes`` and ``get_gene_sequences``.
        :param output: Writable object receiving the alignment.
        :raises PyMLSTError: When no gene is selected.
        """
        coregene = read_gene_list(base, self.list_file)
        if len(coregene) == 0:
            raise exceptions.PyMLSTError(
                'No valid genes selected, verify your genes list')
        strains = base.get_all_strains()
        duplicated = base.get_duplicated_genes()
        total = len(coregene)

        sequences = {}
        for strain in strains:
            sequences[strain] = []

        for rank, gene in enumerate(coregene, start=1):
            if gene in duplicated:
                logging.info("%s/%s | %s     %s", rank, total,
                             gene, "No: Repeat gene")
                continue

            seqs = base.get_gene_sequences(gene)
            lengths = {len(seq[2]) for seq in seqs}
            use_as_is = len(lengths) == 1 and self.realign is False

            if use_as_is:
                mode = "Direct"
            else:
                # Align on sequence id, then put the aligned text back in place.
                aligned = mafft.align({str(seq[0]): seq[2] for seq in seqs})
                for seq in seqs:
                    seq[2] = aligned.get(str(seq[0]))
                mode = "Align"

            self.add_sequence_strain(seqs, strains, sequences)
            logging.info("%s/%s | %s     %s", rank, total, gene, mode)

        # One fasta record per strain, one line per gene.
        for strain in strains:
            output.write('>' + strain + "\n")
            body = "\n".join(str(part) for part in sequences.get(strain))
            output.write(body + "\n")
Exemple #3
0
def run_kma(fastq, basename, identity, coverage, reads):
    """Run KMA on one or two fastq files and return its parsed results.

    :param fastq: Sequence of one (single-end) or two (paired-end) file
                  objects; only their ``name`` attribute is used.
    :param basename: Path prefix of the database; the KMA index is expected
                     at ``basename + suffix``.
    :param identity: Minimum identity, forwarded to ``read_kma_res``.
    :param coverage: Minimum coverage, forwarded to ``read_kma_res``.
    :param reads: Minimum read depth, forwarded to ``read_kma_res``.
    :return: Tuple ``(kma_res, seqs)`` with the parsed ``.res`` table and
             the sequences read from the ``.fsa`` file.
    :raises PyMLSTError: When the database is not indexed, the number of
                         fastq files is not 1 or 2, or KMA did not produce
                         its result files.
    :raises BinaryNotFound: When the KMA binary cannot be located.
    :raises CoreGenomePathNotFound: When no result passes the filters.
    """
    if is_database_indexing(basename) is False:
        raise exceptions.PyMLSTError('Dabatase must be index with KMA')

    path = config.get_binary_path('kma')
    if path is None:
        raise exceptions.BinaryNotFound('KMA binary was not found')

    # Only the unique name is needed: KMA appends its own extensions to it,
    # so the placeholder file is released straight away.
    with tempfile.NamedTemporaryFile('w+t') as tmp:
        baseout = tmp.name

    command = [path, '-t_db', basename + suffix, '-o', baseout, '-nf']
    if len(fastq) == 1:
        command.extend(['-i', fastq[0].name])
    elif len(fastq) == 2:
        command.extend(['-ipe', fastq[0].name, fastq[1].name])
    elif len(fastq) == 0:
        raise exceptions.PyMLSTError('No fastq file in input of run_kma')
    else:
        raise exceptions.PyMLSTError('Too many fastq files in input of run_kma')

    logging.info("Running KMA with cg/wgMLST database")
    proc = subprocess.Popen(command, stderr=subprocess.PIPE,
                            stdout=subprocess.PIPE)
    _, error = proc.communicate()

    # KMA reports on stderr even on success; the presence of both result
    # files decides whether that text is diagnostic or an actual error.
    succeeded = (os.path.exists(baseout + ".res")
                 and os.path.exists(baseout + ".fsa"))
    log = logging.debug if succeeded else logging.error
    for line in BytesIO(error).readlines():
        log(line.decode().rstrip())
    if not succeeded:
        raise exceptions.PyMLSTError(
            'An error occurred while running KMA')

    try:
        with open(baseout + ".res", 'r') as kma:
            kma_res = read_kma_res(kma, coverage, identity, reads)
        seqs = utils.read_genome(baseout + ".fsa")
    finally:
        # Remove KMA's temporary files even when parsing fails.
        del_kma_tmp(baseout)

    if len(kma_res) == 0:
        raise exceptions.CoreGenomePathNotFound(
            'No path was found for the core genome')
    return kma_res, seqs
Exemple #4
0
    def get_valid_shema(self, base):
        """Return the core genes selected by the configured filters.

        A gene passes when all three conditions hold:

        * ``self.keep`` is not ``True``, or the gene has more than one
          distinct sequence;
        * the gene is present in at least ``self.mincover`` strains;
        * ``self.duplicate`` is truthy, or the gene is not duplicated.

        When ``self.inverse`` is ``False`` the passing genes are returned,
        otherwise the genes failing at least one condition are returned.

        :param base: Database object providing ``get_all_strains``,
                     ``get_core_genes``, ``get_duplicated_genes``,
                     ``count_souches_per_gene`` and
                     ``count_sequences_per_gene``.
        :return: List of gene identifiers, in the order of ``get_core_genes``.
        :raises PyMLSTError: When ``self.mincover`` is negative or larger
                             than the number of strains.
        """
        strains = base.get_all_strains()
        if not 0 <= self.mincover <= len(strains):
            raise exceptions.PyMLSTError(
                'Mincover must be between 0 and number of strains {}'.format(
                    len(strains)))

        allgene = base.get_core_genes()
        duplicated = base.get_duplicated_genes()
        strains_per_gene = base.count_souches_per_gene()
        versions_per_gene = base.count_sequences_per_gene()

        want_passing = self.inverse is False
        valid_shema = []
        for gene in allgene:
            varies = self.keep is not True or versions_per_gene.get(gene, 0) > 1
            covered = strains_per_gene.get(gene, 0) >= self.mincover
            unique = bool(self.duplicate) or gene not in duplicated

            passes = varies and covered and unique
            if passes == want_passing:
                valid_shema.append(gene)

        logging.info("Number of coregene used : %s/%s", len(valid_shema),
                     len(allgene))
        return valid_shema
Exemple #5
0
def cli(force, prompt, database, species):
    """Create a wgMLST DATABASE from an online resource.

    The research can be filtered by adding a SPECIES name."""

    utils.create_logger()

    try:

        if os.path.exists(database):
            if force:
                # Truncate the existing file so the database starts empty.
                with open(database, "w"):
                    pass
            else:
                raise exceptions.PyMLSTError(
                    "Database alreadly exists, use --force to override it")

        url = web.retrieve_cgmlst(' '.join(species), prompt)

        if url is None:
            logging.info('No choice selected')
            return

        logging.info('Downloading the core genome...')

        # delete=False lets the file be reopened by name after it is closed;
        # it therefore has to be removed explicitly once the import is done.
        tmp = tempfile.NamedTemporaryFile('w+', delete=False)
        try:
            with tmp:
                skipped = web.get_cgmlst_file(url, tmp)

            if len(skipped) > 0:
                logging.info('Skipped the following malformed file(s): %s',
                             ', '.join(skipped))

            with pymlst.open_wg(os.path.abspath(database)) as mlst:
                mlst.create(tmp.name)
        finally:
            try:
                os.remove(tmp.name)
            except OSError as rm_err:
                # Best effort: a leftover temp file must not mask the outcome.
                logging.debug('Could not remove temporary file %s: %s',
                              tmp.name, rm_err)

    except requests.exceptions.HTTPError as err:
        raise click.ClickException('Could not retrieve online data') from err
    except requests.exceptions.ConnectionError as err:
        raise click.ClickException(
            'Could not access to the server, please verify your internet connection'
        ) from err
    except requests.exceptions.Timeout as err:
        raise click.ClickException(
            'The server took too long to respond') from err
    except web.StructureError as err:
        raise click.ClickException(
            'It seems like the structure of the website/API changed '
            'since this application was developed.') from err
    except exceptions.PyMLSTError as err:
        raise click.ClickException(str(err)) from err
Exemple #6
0
 def add_sequence_strain(self, seqs, strains, sequences):
     """Append one gene column to the running alignment of every strain.

     Each entry of ``seqs`` is indexed as ``entry[1]`` (container tested
     with ``strain in``) and ``entry[2]`` (the sequence). A strain with no
     matching entry receives a run of ``-`` as long as the first sequence
     of ``seqs`` (empty when ``seqs`` is empty).

     :param seqs: Sequence entries of a single gene.
     :param strains: Strain names to fill.
     :param sequences: Mapping strain -> list of columns, updated in place.
     :raises PyMLSTError: When a strain matches more than one entry.
     """
     gap_size = len(seqs[0][2]) if len(seqs) > 0 else 0
     for strain in strains:
         found = [entry[2] for entry in seqs if strain in entry[1]]
         if len(found) > 1:
             raise exceptions.PyMLSTError(
                 'Repeated genes must be excluded in order to export alignment'
             )
         column = found[0] if found else '-' * gap_size
         sequences.get(strain).append(column)
Exemple #7
0
def cli(force, prompt, mlst, database, species):
    """Create a claMLST DATABASE from an online resource.

    The research can be filtered by adding a SPECIES name."""
    # Local import: ExitStack is only needed to manage the downloaded files.
    from contextlib import ExitStack

    utils.create_logger()

    try:

        if os.path.exists(database):
            if force:
                # Truncate the existing file so the database starts empty.
                with open(database, "w"):
                    pass
            else:
                raise exceptions.PyMLSTError(
                    "Database alreadly exists, use --force to override it")

        url = web.retrieve_mlst(' '.join(species), prompt, mlst)

        if url is None:
            logging.info('No choice selected')
            return

        logging.info('Downloading mlst...')

        with tempfile.TemporaryDirectory() as tmp_dir, \
                pymlst.open_cla(os.path.abspath(database)) as mlst_db:

            web.get_mlst_files(url, tmp_dir)

            locus_dir = os.path.join(tmp_dir, 'locus')
            # Every opened file is closed before the temporary directory
            # is removed, whether or not the import succeeds.
            with ExitStack() as stack:
                profiles = stack.enter_context(
                    open(os.path.join(tmp_dir, 'profiles.csv'), 'rt'))
                loci = [
                    stack.enter_context(
                        open(os.path.join(locus_dir, locus), 'r'))
                    for locus in os.listdir(locus_dir)
                ]
                mlst_db.create(profiles, loci)

    except requests.exceptions.HTTPError as err:
        raise click.ClickException('Could not retrieve online data') from err
    except requests.exceptions.ConnectionError as err:
        raise click.ClickException(
            'Could not access to the server, please verify your internet connection'
        ) from err
    except requests.exceptions.Timeout as err:
        raise click.ClickException(
            'The server took too long to respond') from err
    except web.StructureError as err:
        raise click.ClickException(
            'It seems like the structure of the website/API changed '
            'since this application was developed.') from err
    except exceptions.PyMLSTError as err:
        raise click.ClickException(str(err)) from err
Exemple #8
0
def cli(force, database, scheme, alleles):
    """Create a classical MLST DATABASE from a SCHEME csv and ALLELES files."""

    try:
        already_there = os.path.exists(database)

        # Refuse to touch an existing database unless explicitly forced.
        if already_there and not force:
            raise exceptions.PyMLSTError(
                "Database alreadly exists, use --force to override it")

        if already_there:
            # Forced: empty the existing file before recreating the database.
            with open(database, "w"):
                pass

        target = os.path.abspath(database)
        with pymlst.open_cla(target) as mlst:
            mlst.create(scheme, alleles)

    except exceptions.PyMLSTError as err:
        raise click.ClickException(str(err))
Exemple #9
0
 def multi_read(self,
                fastqs,
                identity=0.90,
                coverage=0.95,
                reads=10,
                paired=True,
                fasta=None,
                output=sys.stdout):
     """Search the **Sequence Type** of one or several strains from raw reads.

     The reads are processed one strain at a time through
     ``self.search_read``; each result is written to ``output`` and only
     the first one is written with its header.

     :param fastqs: Tuple of raw reads files; consecutive files form a pair
                    when ``paired`` is true.
     :param identity: Minimum identity handed to ``search_read``.
     :param coverage: Minimum coverage handed to ``search_read``.
     :param reads: Minimum reads coverage handed to ``search_read``.
     :param paired: Whether the raw reads come as pairs or as single files.
     :param fasta: A file where to export genes alleles results in a fasta
                   format.
     :param output: An output for the sequence type research results.
     :raises PyMLSTError: In paired mode, when the number of files is odd.
     """
     if paired:
         if len(fastqs) % 2 != 0:
             raise exceptions.PyMLSTError(
                 "Fastq paired files are not a multiple of 2")
         batches = zip(fastqs[::2], fastqs[1::2])
     else:
         batches = ([single] for single in fastqs)

     for rank, batch in enumerate(batches):
         if paired:
             logging.info("Search ST from files: %s - %s",
                          os.path.basename(batch[0].name),
                          os.path.basename(batch[1].name))
         else:
             logging.info("Search ST from files: %s",
                          os.path.basename(batch[0].name))

         res = self.search_read(batch, identity, coverage, reads, fasta)
         # Only the very first result carries the header line.
         res.write(output, rank == 0)
         logging.info("FINISH")
Exemple #10
0
def cli(database, force, **kwargs):
    """Create a wgMLST DATABASE from a template COREGENE."""

    try:
        already_there = os.path.exists(database)

        # Refuse to touch an existing database unless explicitly forced.
        if already_there and not force:
            raise exceptions.PyMLSTError(
                "Database alreadly exists, use --force to override it")

        if already_there:
            # Forced: empty the existing file before recreating the database.
            with open(database, "w"):
                pass

        target = os.path.abspath(database)
        with pymlst.open_wg(target) as mlst:
            options = utils.clean_kwargs(kwargs)
            mlst.create(**options)

    except exceptions.DuplicatedGeneSequence as err:
        # Handled before the generic PyMLSTError handler below so the hint
        # about the relevant options is appended.
        message = '{}, use -c or -r options to manage it'.format(str(err))
        raise click.UsageError(message)
    except exceptions.PyMLSTError as err:
        raise click.UsageError(str(err))
Exemple #11
0
def run_blat(genome, tmpfile, tmpout, identity, coverage):
    """Run BLAT and return the retained alignments grouped by gene.

    :param genome: File object of the genome; only ``name`` is used.
    :param tmpfile: File object of the query sequences; only ``name`` is used.
    :param tmpout: File object receiving BLAT's psl output; only ``name``
                   is used.
    :param identity: Minimum identity as a fraction; passed to BLAT
                     multiplied by 100.
    :param coverage: Minimum coverage; alignments with
                     ``coverage <= psl.coverage <= 1`` are kept.
    :return: Dict mapping ``psl.gene_id()`` to the list of ``Psl`` objects.
    :raises BinaryNotFound: When the BLAT binary cannot be located.
    :raises PyMLSTError: When BLAT wrote anything to stderr.
    :raises CoreGenomePathNotFound: When no alignment is retained.
    """
    path = config.get_binary_path('blat')
    if path is None:
        raise exceptions.BinaryNotFound('BLAT binary was not found')

    command = [
        path, '-maxIntron=20', '-fine', '-minIdentity=' + str(identity * 100),
        genome.name, tmpfile.name, tmpout.name
    ]
    proc = subprocess.Popen(command,
                            stderr=subprocess.PIPE,
                            stdout=subprocess.PIPE)

    output, error = proc.communicate()
    for line in BytesIO(output).readlines():
        logging.debug(line.decode().rstrip())

    # Any line on stderr is treated as a failure of the run.
    have_error = False
    for line in BytesIO(error).readlines():
        have_error = True
        logging.error(line.decode().rstrip())
    if have_error:
        raise exceptions.PyMLSTError('An error occurred while running BLAT')

    genes = {}
    with open(tmpout.name, 'r') as psl_file:
        for line in psl_file:
            # Only lines whose first field is an integer are parsed;
            # everything else (e.g. blank lines) is skipped.
            try:
                int(line.split()[0])
            except (ValueError, IndexError):
                continue
            psl = Psl(line)
            if coverage <= psl.coverage <= 1:
                genes.setdefault(psl.gene_id(), []).append(psl)

    if len(genes) == 0:
        raise exceptions.CoreGenomePathNotFound(
            'No path was found for the core genome')
    return genes
Exemple #12
0
def index_database(basename, coregenes):
    """Build the KMA index of a database unless it is already indexed.

    :param basename: Path prefix of the database; the index is written to
                     ``basename + suffix``.
    :param coregenes: Temporary file containing the core genes sequences;
                      only its ``name`` attribute is used.
    :raises BinaryNotFound: When the KMA binary cannot be located.
    :raises PyMLSTError: When the database is still not indexed after the
                         KMA run.
    """
    if is_database_indexing(basename) is not False:
        return

    kma_path = config.get_binary_path('kma')
    if kma_path is None:
        raise exceptions.BinaryNotFound('KMA binary was not found')
    logging.info("Indexing database %s with kma", os.path.basename(basename))

    arguments = [kma_path, 'index', '-i', coregenes.name,
                 '-o', basename + suffix]
    process = subprocess.Popen(arguments,
                               stderr=subprocess.PIPE,
                               stdout=subprocess.PIPE)
    _stdout, stderr = process.communicate()

    # stderr is an error report on failure and plain diagnostics otherwise.
    failed = is_database_indexing(basename) is False
    report = logging.error if failed else logging.debug
    for raw in BytesIO(stderr).readlines():
        report(raw.decode().rstrip())

    if failed:
        raise exceptions.PyMLSTError(
            'An error occurred while indexing KMA')
Exemple #13
0
def find_subgraph(distance, threshold=50, output=sys.stdout, export='list'):
    """Searches groups of strains linked by distances within a threshold.

    Two strains are linked when their distance is lower than or equal to
    ``threshold``. Each connected component of the resulting graph is one
    group; groups are numbered from the largest to the smallest.

    :param distance: Distance matrix file
                     (output of :class:`~pymlst.wg.extractors.TableExtractor`
                     with ``export='distance'``). The first line holds the
                     number of strains; every following line holds a strain
                     name followed by tab-separated integer distances.
    :param threshold: Maximum distance for two strains to be linked.
    :param output: The output where to write the results.
    :param export: Sets the export type: ``'group'`` writes one line per
                   group, ``'count'`` writes a 0/1 membership table, any
                   other value (default ``'list'``) writes one
                   ``group<TAB>strain`` line per strain.
    :raises PyMLSTError: When the first line is not an integer or does not
                         match the number of strain lines.
    """
    samps = []
    dists = []
    try:
        strains = int(distance.readline().rstrip("\n"))
    except Exception as err:
        raise exceptions.PyMLSTError(
            "The distance file seems not correctly "
            "formatted, not integer on first line") from err

    for line in distance.readlines():
        dist_line = line.rstrip("\n").split("\t")
        samps.append(dist_line[0])
        dists.append(dist_line[1:])

    if len(samps) != strains:
        raise exceptions.PyMLSTError(
            "The distance is not properly formatted, "
            "the number of strains ({}) doesn't correspond to {}".format(
                len(samps), strains))

    # Build the graph: one node per strain, one edge per close enough pair.
    graph = nx.Graph()
    graph.add_nodes_from(samps)

    for strain_index, (strain, row) in enumerate(zip(samps, dists)):
        for dist_index, dist in enumerate(row):
            dist = int(dist)
            if strain_index == dist_index or dist > threshold:
                continue
            graph.add_edge(strain, samps[dist_index], weight=dist)

    # Position of the first occurrence of each name, so a group member is
    # resolved in constant time instead of rescanning the strain list.
    first_index = {}
    for index, name in enumerate(samps):
        first_index.setdefault(name, index)

    # Each connected component becomes a group of strain positions.
    grps = [
        [first_index[node] for node in graph.subgraph(component).nodes()]
        for component in nx.connected_components(graph)
    ]
    grps.sort(key=len, reverse=True)

    # write result
    if export == 'group':
        for i, group in enumerate(grps):
            output.write('Group' + str(i))
            for node in group:
                output.write(" " + samps[node])
            output.write("\n")

    elif export == 'count':
        output.write('Group\t' + '\t'.join(samps) + '\n')
        for i, group in enumerate(grps):
            line = len(samps) * [0]
            for node in group:
                line[node] = 1
            output.write(str(i) + '\t' + '\t'.join(map(str, line)) + '\n')
    else:
        for i, group in enumerate(grps):
            for node in group:
                output.write('Group' + str(i) + '\t' + samps[node] + '\n')