def find_recombination(genes, alignment, output=sys.stdout):
    """Write, for each gene, its mutation count, length and mutation rate.

    The alignment is read sample by sample: every header line (starting with
    ``>``) opens a new sample, and the lines that follow are taken, in order,
    as the aligned sequences of the genes listed in ``genes``. The mutation
    count of a gene is whatever ``utils.compar_seqs`` returns for the list of
    its aligned sequences.

    :param genes: List of genes (output of
        :class:`~pymlst.wg.extractors.TableExtractor` using
        ``export='gene'``).
    :param alignment: `fasta`_ file alignment (output of
        :class:`~pymlst.wg.extractors.SequenceExtractor` using
        ``align=True``).
    :param output: The output where to write the results.
    :raises exceptions.PyMLSTError: If a sample holds more sequences than
        there are genes, if the sequences of a gene do not all have the same
        length, or if a gene has no sequence at all in the alignment.
    """
    genes = [line.rstrip("\n") for line in genes]
    logging.info("Number of genes to look at : %s", len(genes))
    sequences = [[] for _ in genes]

    # Load sequences by gene: the position of a line after its header gives
    # the index of the gene it belongs to.
    indice = 0
    for line in alignment:
        line = line.rstrip("\n")
        if line.startswith(">"):
            # New sample: restart the gene counter.
            indice = 0
            continue
        if indice >= len(genes):
            raise exceptions.PyMLSTError(
                "The genes list doesn't correspond to the alignment {}".format(
                    indice))
        sequences[indice].append(line)
        indice += 1

    # Validate everything before writing, so no partial table is emitted.
    for gene, seqs in zip(genes, sequences):
        if not seqs:
            # Previously crashed with an IndexError on ``seqs[0]``.
            raise exceptions.PyMLSTError(
                'No sequence found in the alignment for gene: {}'.format(gene))
        lengths = {len(s) for s in seqs}
        if len(lengths) > 1:
            logging.error(lengths)
            raise exceptions.PyMLSTError(
                'The following genes are not aligned: {}'.format(gene))

    output.write("Gene\tMutation\tLenght\tmutation per 100 base\n")
    for gene, seqs in zip(genes, sequences):
        compared = utils.compar_seqs(seqs)
        length = len(seqs[0])
        # Zero-length alignments (an empty line per sample) would otherwise
        # raise ZeroDivisionError; report a rate of 0.0 for them.
        rate = compared / length * 100 if length else 0.0
        output.write("\t".join(
            (gene, str(compared), str(length), str(rate))) + "\n")
def extract(self, base, output):
    """Write the per-strain sequences of the selected genes to ``output``.

    Genes come from ``read_gene_list(base, self.list_file)``. Genes reported
    by ``base.get_duplicated_genes()`` are skipped. For every other gene the
    sequences are used as they are when they all share one length and
    ``self.realign`` is ``False``; otherwise they are passed through
    ``mafft.align`` first. Each strain is then written as a ``>strain``
    header followed by one line per retained gene.

    :param base: Database object providing the strains, duplicated genes and
        gene sequences.
    :param output: Writable object receiving the result.
    :raises exceptions.PyMLSTError: If the gene selection is empty.
    """
    coregene = read_gene_list(base, self.list_file)
    total = len(coregene)
    if total == 0:
        raise exceptions.PyMLSTError(
            'No valid genes selected, verify your genes list')

    strains = base.get_all_strains()
    duplicated = base.get_duplicated_genes()
    sequences = {strain: [] for strain in strains}

    for rank, gene in enumerate(coregene, start=1):
        if gene in duplicated:
            logging.info("%s/%s | %s %s", rank, total, gene,
                         "No: Repeat gene")
            continue

        seqs = base.get_gene_sequences(gene)
        # entry[0] is used as the sequence identifier, entry[2] as the
        # sequence itself.
        lengths = {len(entry[2]) for entry in seqs}

        if len(lengths) == 1 and self.realign is False:
            status = "Direct"
        else:
            aligned = mafft.align({str(entry[0]): entry[2] for entry in seqs})
            # Replace each raw sequence, in place, by its aligned version.
            for entry in seqs:
                entry[2] = aligned.get(str(entry[0]))
            status = "Align"

        self.add_sequence_strain(seqs, strains, sequences)
        logging.info("%s/%s | %s %s", rank, total, gene, status)

    # Output the alignment result, one record per strain.
    for strain in strains:
        output.write('>' + strain + "\n")
        output.write(
            "\n".join(str(seq) for seq in sequences.get(strain)) + "\n")
def run_kma(fastq, basename, identity, coverage, reads):
    """Run KMA on one or two fastq files and return its parsed results.

    :param fastq: Sequence of one (single-end) or two (paired-end) file
        objects; only their ``name`` attribute is used.
    :param basename: Base name of the database; the KMA index is looked up
        at ``basename + suffix``.
    :param identity: Forwarded to ``read_kma_res``.
    :param coverage: Forwarded to ``read_kma_res``.
    :param reads: Forwarded to ``read_kma_res``.
    :return: Tuple ``(kma_res, seqs)`` holding the parsed ``.res`` file and
        the content of the ``.fsa`` file read with ``utils.read_genome``.
    :raises exceptions.PyMLSTError: If the database is not indexed, if the
        number of fastq files is neither 1 nor 2, or if KMA did not produce
        its ``.res`` and ``.fsa`` files.
    :raises exceptions.BinaryNotFound: If the KMA binary is not configured.
    :raises exceptions.CoreGenomePathNotFound: If the parsed result is empty.
    """
    if is_database_indexing(basename) is False:
        raise exceptions.PyMLSTError('Dabatase must be index with KMA')

    kma_bin = config.get_binary_path('kma')
    if kma_bin is None:
        raise exceptions.BinaryNotFound('KMA binary was not found')

    # The temporary file only supplies a unique output prefix for KMA.
    with tempfile.NamedTemporaryFile('w+t') as handle:
        baseout = handle.name

    command = [kma_bin, '-t_db', basename + suffix, '-o', baseout, '-nf']
    nb_files = len(fastq)
    if nb_files == 1:
        command += ['-i', fastq[0].name]
    elif nb_files == 2:
        command += ['-ipe', fastq[0].name, fastq[1].name]
    else:
        raise exceptions.PyMLSTError('Too many fastq files in input of run_kma')

    logging.info("Running KMA with cg/wgMLST database")
    proc = subprocess.Popen(command,
                            stderr=subprocess.PIPE,
                            stdout=subprocess.PIPE)
    _, error = proc.communicate()

    res_path = baseout + ".res"
    fsa_path = baseout + ".fsa"
    # Success is judged by the presence of both result files.
    succeeded = os.path.exists(res_path) and os.path.exists(fsa_path)
    report = logging.debug if succeeded else logging.error
    for line in BytesIO(error).readlines():
        report(line.decode().rstrip())
    if not succeeded:
        raise exceptions.PyMLSTError(
            'An error occurred while running KMA')

    with open(res_path, 'r') as kma:
        kma_res = read_kma_res(kma, coverage, identity, reads)
    seqs = utils.read_genome(fsa_path)
    del_kma_tmp(baseout)

    if len(kma_res) == 0:
        raise exceptions.CoreGenomePathNotFound(
            'No path was found for the core genome')
    return kma_res, seqs
def get_valid_shema(self, base):
    """Return the core genes that satisfy the configured filters.

    A gene passes when all three criteria hold:

    * if ``self.keep`` is ``True``, it has more than one distinct sequence;
    * it is present in at least ``self.mincover`` strains;
    * unless ``self.duplicate`` is set, it is not a duplicated gene.

    When ``self.inverse`` is ``False`` the passing genes are returned;
    otherwise the genes failing at least one criterion are returned.

    :param base: Database object providing strains, core genes, duplicated
        genes and the per-gene counts.
    :return: List of selected genes, in the order of
        ``base.get_core_genes()``.
    :raises exceptions.PyMLSTError: If ``self.mincover`` is negative or
        larger than the number of strains.
    """
    strains = base.get_all_strains()
    nb_strains = len(strains)
    too_low = self.mincover < 0
    if too_low or self.mincover > nb_strains:
        raise exceptions.PyMLSTError(
            'Mincover must be between 0 and number of strains {}'.format(
                nb_strains))

    allgene = base.get_core_genes()
    dupli = base.get_duplicated_genes()
    # Number of strains carrying each gene (duplications excluded upstream).
    count_souches = base.count_souches_per_gene()
    # Number of distinct sequences for each gene.
    diff = base.count_sequences_per_gene()

    want_passing = self.inverse is False
    valid_shema = []
    for gene in allgene:
        varies = self.keep is not True or diff.get(gene, 0) > 1
        covered = count_souches.get(gene, 0) >= self.mincover
        unique = bool(self.duplicate) or gene not in dupli
        passed = varies and covered and unique
        # Normal mode keeps passing genes, inverse mode keeps the others.
        if passed == want_passing:
            valid_shema.append(gene)

    logging.info("Number of coregene used : %s/%s", len(valid_shema),
                 len(allgene))
    return valid_shema
def cli(force, prompt, database, species):
    """Create a wgMLST DATABASE from an online resource.

    The research can be filtered by adding a SPECIES name."""
    # NOTE(review): the docstring above is presumably rendered as the command
    # help text, so it is kept user-facing; developer notes live in comments.
    #
    # force    -- truncate DATABASE when it already exists instead of failing.
    # prompt   -- forwarded to web.retrieve_cgmlst together with the species.
    # database -- path of the database file to create.
    # species  -- iterable of words, joined with spaces to form the query.
    utils.create_logger()

    try:
        if os.path.exists(database):
            if force:
                # Truncate the existing file so the database starts empty.
                open(database, "w").close()
            else:
                raise exceptions.PyMLSTError(
                    "Database alreadly exists, use --force to override it")

        url = web.retrieve_cgmlst(' '.join(species), prompt)
        if url is None:
            logging.info('No choice selected')
            return

        logging.info('Downloading the core genome...')
        # delete=False so the file can be closed and then reopened by name;
        # it is therefore removed explicitly in the ``finally`` below
        # (it used to be left behind on every run).
        tmp = tempfile.NamedTemporaryFile('w+', delete=False)
        try:
            with tmp:
                skipped = web.get_cgmlst_file(url, tmp)
            if len(skipped) > 0:
                logging.info('Skipped the following malformed file(s): %s',
                             ', '.join(skipped))

            with pymlst.open_wg(os.path.abspath(database)) as mlst:
                mlst.create(tmp.name)
        finally:
            try:
                os.remove(tmp.name)
            except OSError as err:
                # Best-effort cleanup: never mask the original error.
                logging.debug('Could not remove temporary file %s: %s',
                              tmp.name, err)

    except requests.exceptions.HTTPError as err:
        raise click.ClickException('Could not retrieve online data') from err
    except requests.exceptions.ConnectionError as err:
        raise click.ClickException(
            'Could not access to the server, please verify your internet connection'
        ) from err
    except requests.exceptions.Timeout as err:
        raise click.ClickException(
            'The server took too long to respond') from err
    except web.StructureError as err:
        raise click.ClickException(
            'It seems like the structure of the website/API changed '
            'since this application was developed.') from err
    except exceptions.PyMLSTError as err:
        raise click.ClickException(str(err)) from err
def add_sequence_strain(self, seqs, strains, sequences):
    """Append, for every strain, its sequence of the current gene.

    A strain with no matching entry receives a run of ``-`` as long as the
    first sequence of ``seqs`` (an empty string when ``seqs`` is empty).

    :param seqs: Entries of one gene; ``entry[2]`` is the sequence and
        ``entry[1]`` is tested with ``strain in entry[1]`` (presumably the
        strains carrying that sequence -- confirm against the caller).
    :param strains: Strains to fill, processed in order.
    :param sequences: Mapping strain -> list of sequences, updated in place.
    :raises exceptions.PyMLSTError: If a strain matches several entries.
    """
    gap = '-' * (len(seqs[0][2]) if len(seqs) > 0 else 0)

    for strain in strains:
        matches = [entry[2] for entry in seqs if strain in entry[1]]
        if len(matches) > 1:
            raise exceptions.PyMLSTError(
                'Repeated genes must be excluded in order to export alignment'
            )
        # No match: pad with gaps so every strain keeps one line per gene.
        chosen = matches[0] if matches else gap
        sequences.get(strain).append(chosen)
def cli(force, prompt, mlst, database, species):
    """Create a claMLST DATABASE from an online resource.

    The research can be filtered by adding a SPECIES name."""
    # NOTE(review): the docstring above is presumably rendered as the command
    # help text, so it is kept user-facing; developer notes live in comments.
    #
    # force    -- truncate DATABASE when it already exists instead of failing.
    # prompt   -- forwarded to web.retrieve_mlst.
    # mlst     -- forwarded to web.retrieve_mlst (scheme selection).
    # database -- path of the database file to create.
    # species  -- iterable of words, joined with spaces to form the query.
    import contextlib  # local import: only this command needs ExitStack

    utils.create_logger()

    try:
        if os.path.exists(database):
            if force:
                # Truncate the existing file so the database starts empty.
                open(database, "w").close()
            else:
                raise exceptions.PyMLSTError(
                    "Database alreadly exists, use --force to override it")

        url = web.retrieve_mlst(' '.join(species), prompt, mlst)
        if url is None:
            logging.info('No choice selected')
            return

        logging.info('Downloading mlst...')
        with tempfile.TemporaryDirectory() as tmp_dir, \
                pymlst.open_cla(os.path.abspath(database)) as mlst_db:
            web.get_mlst_files(url, tmp_dir)

            # Every downloaded file is registered on the stack so it is
            # closed once the database is filled, and before the temporary
            # directory is removed (the handles used to be leaked).
            with contextlib.ExitStack() as stack:
                profile = stack.enter_context(
                    open(tmp_dir + '/profiles.csv', 'rt'))
                alleles = [
                    stack.enter_context(
                        open(tmp_dir + '/locus/' + locus, 'r'))
                    for locus in os.listdir(tmp_dir + '/locus')
                ]
                mlst_db.create(profile, alleles)

    except requests.exceptions.HTTPError as err:
        raise click.ClickException('Could not retrieve online data') from err
    except requests.exceptions.ConnectionError as err:
        raise click.ClickException(
            'Could not access to the server, please verify your internet connection'
        ) from err
    except requests.exceptions.Timeout as err:
        raise click.ClickException(
            'The server took too long to respond') from err
    except web.StructureError as err:
        raise click.ClickException(
            'It seems like the structure of the website/API changed '
            'since this application was developed.') from err
    except exceptions.PyMLSTError as err:
        raise click.ClickException(str(err)) from err
def cli(force, database, scheme, alleles):
    """Create a classical MLST DATABASE from a SCHEME csv and ALLELES files."""
    # force    -- truncate DATABASE when it already exists instead of failing.
    # database -- path of the database file to create.
    # scheme   -- forwarded as first argument of the database ``create``.
    # alleles  -- forwarded as second argument of the database ``create``.
    try:
        if os.path.exists(database):
            if not force:
                raise exceptions.PyMLSTError(
                    "Database alreadly exists, use --force to override it")
            # Opening in write mode empties the existing file.
            with open(database, "w"):
                pass

        db_path = os.path.abspath(database)
        with pymlst.open_cla(db_path) as mlst:
            mlst.create(scheme, alleles)

    except exceptions.PyMLSTError as err:
        # Surface library errors as regular command-line errors.
        raise click.ClickException(str(err))
def multi_read(self, fastqs, identity=0.90, coverage=0.95, reads=10,
               paired=True, fasta=None, output=sys.stdout):
    """Search the **Sequence Type** of one or several strains from raw reads.

    Files are processed one by one, or two by two when ``paired`` is set.
    Each batch is handed to ``self.search_read`` and its result is written
    to ``output``; only the first result is written with its header.

    :param fastqs: Sequence of raw reads file objects (their ``name`` is
        used for logging). In paired mode, consecutive files form a pair.
    :param identity: Minimum identity, forwarded to ``search_read``.
    :param coverage: Minimum coverage, forwarded to ``search_read``.
    :param reads: Minimum reads coverage, forwarded to ``search_read``.
    :param paired: Whether the raw reads are paired or single.
    :param fasta: A file where to export genes alleles results in a fasta
        format, forwarded to ``search_read``.
    :param output: An output for the sequence type research results.
    :raises exceptions.PyMLSTError: In paired mode, if the number of files
        is odd.
    """
    if paired:
        if len(fastqs) % 2 != 0:
            raise exceptions.PyMLSTError(
                "Fastq paired files are not a multiple of 2")
        # Forward reads sit at even positions, reverse reads at odd ones.
        batches = zip(fastqs[::2], fastqs[1::2])
    else:
        batches = ([single] for single in fastqs)

    for position, batch in enumerate(batches):
        if paired:
            logging.info("Search ST from files: %s - %s",
                         os.path.basename(batch[0].name),
                         os.path.basename(batch[1].name))
        else:
            logging.info("Search ST from files: %s",
                         os.path.basename(batch[0].name))
        res = self.search_read(batch, identity, coverage, reads, fasta)
        # The header is written once, with the first result only.
        res.write(output, position == 0)

    logging.info("FINISH")
def cli(database, force, **kwargs):
    """Create a wgMLST DATABASE from a template COREGENE."""
    # database -- path of the database file to create.
    # force    -- truncate DATABASE when it already exists instead of failing.
    # kwargs   -- remaining options, passed through utils.clean_kwargs before
    #             being forwarded to the database ``create``.
    try:
        already_there = os.path.exists(database)
        if already_there and not force:
            raise exceptions.PyMLSTError(
                "Database alreadly exists, use --force to override it")
        if already_there:
            # Opening in write mode empties the existing file.
            with open(database, "w"):
                pass

        db_path = os.path.abspath(database)
        with pymlst.open_wg(db_path) as mlst:
            options = utils.clean_kwargs(kwargs)
            mlst.create(**options)

    except exceptions.DuplicatedGeneSequence as err:
        # Handled before the generic PyMLSTError clause to add a hint about
        # the options dealing with duplicated sequences.
        message = '{}, use -c or -r options to manage it'.format(str(err))
        raise click.UsageError(message)
    except exceptions.PyMLSTError as err:
        raise click.UsageError(str(err))
def run_blat(genome, tmpfile, tmpout, identity, coverage):
    """Run BLAT and return the retained alignments grouped by gene.

    :param genome: File object of the genome; its ``name`` is the first BLAT
        argument.
    :param tmpfile: File object of the query sequences; its ``name`` is the
        second BLAT argument.
    :param tmpout: File object receiving the BLAT output; its ``name`` is
        passed to BLAT and then reopened for reading.
    :param identity: Minimum identity as a fraction; it is multiplied by 100
        for the ``-minIdentity`` option.
    :param coverage: Minimum coverage; only alignments whose ``coverage`` is
        within ``[coverage, 1]`` are kept.
    :return: Dictionary mapping ``psl.gene_id()`` to the list of matching
        ``Psl`` objects.
    :raises exceptions.BinaryNotFound: If the BLAT binary is not configured.
    :raises exceptions.PyMLSTError: If BLAT wrote anything on its standard
        error.
    :raises exceptions.CoreGenomePathNotFound: If no alignment is retained.
    """
    path = config.get_binary_path('blat')
    if path is None:
        raise exceptions.BinaryNotFound('BLAT binary was not found')

    command = [
        path, '-maxIntron=20', '-fine', '-minIdentity=' + str(identity * 100),
        genome.name, tmpfile.name, tmpout.name
    ]
    proc = subprocess.Popen(command,
                            stderr=subprocess.PIPE,
                            stdout=subprocess.PIPE)
    output, error = proc.communicate()

    for line in BytesIO(output).readlines():
        logging.debug(line.decode().rstrip())

    # Any line on stderr is treated as a failure of the run.
    have_error = False
    for line in BytesIO(error).readlines():
        have_error = True
        logging.error(line.decode().rstrip())
    if have_error:
        raise exceptions.PyMLSTError('An error occurred while running BLAT')

    genes = {}
    # The result file is read inside a ``with`` so its handle is always
    # closed (it used to be opened in the ``for`` header and leaked).
    with open(tmpout.name, 'r') as psl_file:
        for line in psl_file:
            # Alignment rows start with an integer; header, separator and
            # blank lines do not and are skipped.
            try:
                int(line.split()[0])
            except (ValueError, IndexError):
                continue
            psl = Psl(line)
            if coverage <= psl.coverage <= 1:
                genes.setdefault(psl.gene_id(), []).append(psl)

    if not genes:
        raise exceptions.CoreGenomePathNotFound(
            'No path was found for the core genome')
    return genes
def index_database(basename, coregenes):
    """Index a database with KMA unless it is already indexed.

    Nothing is done when ``is_database_indexing(basename)`` does not return
    ``False``. Otherwise ``kma index`` is run, writing the index to
    ``basename + suffix``.

    :param basename: Base name of the database.
    :param coregenes: File object containing the core genes sequences; its
        ``name`` is given to KMA as input.
    :raises exceptions.BinaryNotFound: If the KMA binary is not configured.
    :raises exceptions.PyMLSTError: If the database is still not indexed
        after KMA has run.
    """
    if is_database_indexing(basename) is not False:
        return

    kma_bin = config.get_binary_path('kma')
    if kma_bin is None:
        raise exceptions.BinaryNotFound('KMA binary was not found')

    logging.info("Indexing database %s with kma", os.path.basename(basename))
    command = [kma_bin, 'index', '-i', coregenes.name, '-o', basename + suffix]
    proc = subprocess.Popen(command,
                            stderr=subprocess.PIPE,
                            stdout=subprocess.PIPE)
    _, error = proc.communicate()

    # Success is judged by checking the index again, not by the exit status.
    failed = is_database_indexing(basename) is False
    emit = logging.error if failed else logging.debug
    for line in BytesIO(error).readlines():
        emit(line.decode().rstrip())
    if failed:
        raise exceptions.PyMLSTError(
            'An error occurred while indexing KMA')
def find_subgraph(distance, threshold=50, output=sys.stdout, export='list'):
    """Searches groups of strains separated by a distance threshold.

    A graph is built with one node per strain and one edge between every two
    different strains whose distance is lower than or equal to ``threshold``.
    Each connected component of that graph is a group; groups are written
    from the largest to the smallest.

    :param distance: Distance matrix file (output of
        :class:`~pymlst.wg.extractors.TableExtractor` with
        ``export='distance'``): a first line holding the number of strains,
        then one tab-separated line per strain (name followed by its
        distances).
    :param threshold: Maximum distance for two strains to be linked.
    :param output: The output where to write the results.
    :param export: Sets the export type: ``'group'`` writes one line per
        group, ``'count'`` writes a 0/1 membership table, any other value
        writes one ``Group<i>\\t<strain>`` line per strain.
    :raises exceptions.PyMLSTError: If the first line is not an integer or
        does not match the number of strain lines.
    """
    try:
        strains = int(distance.readline().rstrip("\n"))
    except Exception as err:
        raise exceptions.PyMLSTError(
            "The distance file seems not correctly "
            "formatted, not integer on first line") from err

    samps = []
    dists = []
    for line in distance:
        dist_line = line.rstrip("\n").split("\t")
        samps.append(dist_line[0])
        dists.append(dist_line[1:])

    if len(samps) != strains:
        raise exceptions.PyMLSTError(
            "The distance is not properly formatted, "
            "the number of strains ({}) doesn't correspond to {}".format(
                len(samps), strains))

    # Create the graph: strains are nodes, close-enough pairs are edges.
    graph = nx.Graph()
    graph.add_nodes_from(samps)
    for strain_index, strain in enumerate(samps):
        for dist_index, dist in enumerate(dists[strain_index]):
            dist = int(dist)
            if strain_index == dist_index or dist > threshold:
                continue
            graph.add_edge(strain, samps[dist_index], weight=dist)

    # First position of every strain name, so each node is resolved in
    # constant time instead of scanning ``samps`` with ``list.index``.
    position = {}
    for index, strain in enumerate(samps):
        position.setdefault(strain, index)

    # Extract the interconnected subgraphs as lists of strain positions.
    grps = [
        [position[node] for node in graph.subgraph(component).nodes()]
        for component in nx.connected_components(graph)
    ]
    grps.sort(key=len, reverse=True)

    # Write the result.
    if export == 'group':
        for i, group in enumerate(grps):
            output.write('Group' + str(i))
            for node in group:
                output.write(" " + samps[node])
            output.write("\n")
    elif export == 'count':
        output.write('Group\t' + '\t'.join(samps) + '\n')
        for i, group in enumerate(grps):
            line = len(samps) * [0]
            for node in group:
                line[node] = 1
            output.write(str(i) + '\t' + '\t'.join(map(str, line)) + '\n')
    else:
        for i, group in enumerate(grps):
            for node in group:
                output.write('Group' + str(i) + '\t' + samps[node] + '\n')