Example 1
    def to_identifier_df(self, engine, is_lfn_variant_replicate):
        """Returns a list of dictionnaries with run_id, marker_id, sample_id entries (See return)

        :return: pandas.DataFrame: with columns run_id, marker_id, ...
        """

        df = self.read_tsv_into_df(is_lfn_variant_replicate)

        df.run_name = NameIdConverter(df.run_name.tolist(), engine).to_ids(Run)
        df.marker_name = NameIdConverter(df.marker_name.tolist(),
                                         engine).to_ids(Marker)

        variant_id_user_lst = df.variant_id.tolist()
        df['variant_id'] = NameIdConverter(df.variant_sequence.tolist(),
                                           engine).variant_sequence_to_id()
        if not df.variant_id.tolist() == variant_id_user_lst:
            Logger.instance().warning(
                VTAMexception(
                    "Some variant IDs and sequences do not agree in the --cutoff_specific file and in the database."
                ))

        df.rename({
            'run_name': 'run_id',
            'marker_name': 'marker_id'
        },
                  axis=1,
                  inplace=True)

        return df
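A hypothetical usage sketch (the cutoff_file object, its class and the database path are assumptions; the method name and resulting columns come from the snippet above):

    import sqlalchemy

    engine = sqlalchemy.create_engine('sqlite:///db.sqlite')
    # cutoff_file is assumed to be an instance of the class that defines
    # to_identifier_df, built from a --cutoff_specific TSV (see Example 4)
    identifier_df = cutoff_file.to_identifier_df(engine, is_lfn_variant_replicate=False)
    # identifier_df columns: run_id, marker_id, variant_id, cutoff, variant_sequence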
Example 2
    def is_valid(self):
        """Checks that the user parameter set is contained in the default parameter set"""
        for k in self.params_file_dic:
            if k not in self.params_default_dic:
                Logger.instance().error(
                    VTAMexception(
                        'Non-valid parameter "{}" in the file "{}"'.format(
                            k, self.params_path)))
                sys.exit(1)
        return True
Example 3
    def download_precomputed_taxonomy(self):
        """
        Copy the online TSV taxonomy DB
        to the pathname output
        """
        Logger.instance().debug(
            "file: {}; line: {}; Downloading taxonomy tsv".format(
                __file__,
                inspect.currentframe().f_lineno,
            ))

        ############################################################################################
        #
        # Download taxonomy TSV dataset
        #
        ############################################################################################

        taxonomy_tsv_gz_path = '{}.gz'.format(self.taxonomy_tsv_path)
        # Use the local file if present and complete, otherwise download from the remote URLs
        if not os.path.isfile(self.taxonomy_tsv_path) or pathlib.Path(
                self.taxonomy_tsv_path).stat().st_size < 1000000:
            # Try each remote URL in turn; re-raise only if the last one also fails
            taxonomy_tsv_gz_url_list = [
                taxonomy_tsv_gz_url1, taxonomy_tsv_gz_url2,
                taxonomy_tsv_gz_url3
            ]
            for url_i, taxonomy_tsv_gz_url in enumerate(
                    taxonomy_tsv_gz_url_list):
                try:
                    with tqdm(...) as t:
                        t.set_description(
                            os.path.basename(taxonomy_tsv_gz_url))
                        urllib.request.urlretrieve(taxonomy_tsv_gz_url,
                                                   taxonomy_tsv_gz_path,
                                                   reporthook=tqdm_hook(t))
                    break
                except Exception:
                    if url_i == len(taxonomy_tsv_gz_url_list) - 1:
                        raise
            with gzip.open(taxonomy_tsv_gz_path, 'rb') as fin:
                with open(self.taxonomy_tsv_path, 'wb') as fout:
                    shutil.copyfileobj(fin, fout)
            try:
                pathlib.Path(taxonomy_tsv_gz_path).unlink()
            except FileNotFoundError:
                pass
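tqdm_hook is used here and in Example 13 but is not shown in these excerpts. A minimal implementation following the standard tqdm reporthook pattern could look like this (an assumption, not necessarily VTAM's actual helper):

    def tqdm_hook(t):
        """Wraps a tqdm instance into a reporthook for urllib.request.urlretrieve."""
        last_b = [0]

        def update_to(b=1, bsize=1, tsize=None):
            if tsize not in (None, -1):
                t.total = tsize  # total download size, once the server reports it
            t.update((b - last_b[0]) * bsize)  # advance by the newly fetched bytes
            last_b[0] = b

        return update_to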
Example 4
    def read_tsv_into_df(self, is_lfn_variant_replicate):
        """Read into stats_df
        Updated: June 3, 2020

        Parameters
        ----------
        is_lfn_variant_replicate : bool
            True if the algorithm is lfn_variant_replicate, False if it is lfn_variant

        Returns
        -------
        pandas.DataFrame

        """

        df = pandas.read_csv(self.cutoff_specific_tsv, sep="\t", header=0)
        df.columns = df.columns.str.lower()
        df.rename(
            {
                'lfn_variant_cutoff': 'cutoff',
                'lfn_variant_replicate_cutoff': 'cutoff'
            },
            inplace=True,
            axis=1)

        if is_lfn_variant_replicate and set(df.columns.tolist()) >= {
                'run', 'marker', 'variant', 'replicate', 'cutoff', 'sequence'
        }:
            df = df[[
                'run', 'marker', 'variant', 'replicate', 'cutoff', 'sequence'
            ]]
        elif not is_lfn_variant_replicate and set(df.columns.tolist()) >= {
                'run', 'marker', 'variant', 'cutoff', 'sequence'
        }:
            df = df[['run', 'marker', 'variant', 'cutoff', 'sequence']]
        else:
            Logger.instance().critical(
                VTAMexception(
                    "The format of file '{}' is wrong. Columns 'lfn_variant_cutoff' or 'lfn_variant_replicate_cutoff' are required."
                    .format(self.cutoff_specific_tsv)))
            sys.exit(1)

        df.rename(
            {
                'run': 'run_name',
                'marker': 'marker_name',
                'variant': 'variant_id',
                'sequence': 'variant_sequence'
            },
            axis=1,
            inplace=True)

        return df
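For orientation, a minimal cutoff-specific TSV accepted by this method could look as follows (column names come from the code above; the values are invented):

    import io
    import pandas

    # Illustrative --cutoff_specific input for the lfn_variant case
    cutoff_specific_tsv = io.StringIO(
        "run\tmarker\tvariant\tsequence\tlfn_variant_cutoff\n"
        "run1\tMFZR\t25\tACGTACGT\t0.002\n")
    df = pandas.read_csv(cutoff_specific_tsv, sep="\t", header=0)
    # After read_tsv_into_df (is_lfn_variant_replicate=False) the columns are
    # renamed to: run_name, marker_name, variant_id, cutoff, variant_sequence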
Example 5
    def main(arg_parser_dic):

        ###################################################################
        #
        # Create FilterLFNreference table and fill it
        #
        ###################################################################

        engine = sqlalchemy.create_engine('sqlite:///{}'.format(
            str(arg_parser_dic['db'])),
                                          echo=False)
        meta = sqlalchemy.MetaData()
        filter_lfn_reference = sqlalchemy.Table(
            'FilterLFNreference',
            meta,
            sqlalchemy.Column('filter_id',
                              sqlalchemy.Integer,
                              primary_key=True),
            sqlalchemy.Column('filter_name', sqlalchemy.String),
        )
        meta.create_all(engine)

        with engine.connect() as conn:
            for filter_rec in FilterLFNreference_records:
                filter_name = filter_rec['filter_name']
                select_row = conn.execute(
                    sqlalchemy.select([
                        filter_lfn_reference.c.filter_id
                    ]).where(filter_lfn_reference.c.filter_name ==
                             filter_name)).first()
                if select_row is None:  # filter_name IS NOT in the database, so INSERT it
                    conn.execute(
                        filter_lfn_reference.insert().values(**filter_rec))

        wopmars_runner = RunnerWopmars(command=arg_parser_dic['command'],
                                       cli_args_dic=arg_parser_dic)
        wopmars_command = wopmars_runner.get_wopmars_command()

        ########################################################################################
        #
        # Run wopmars
        #
        ########################################################################################

        # Some arguments will be passed through environmental variables
        if 'threads' in arg_parser_dic:
            os.environ['VTAM_THREADS'] = str(arg_parser_dic['threads'])
        Logger.instance().info(wopmars_command)
        run_result = subprocess.run(wopmars_command, shell=True)

        sys.exit(run_result.returncode)
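FilterLFNreference_records is defined elsewhere in the package; from how it is consumed above, each record is a dict whose keys match the FilterLFNreference columns. A stand-in could look like this (the concrete names and ids are assumptions, not VTAM's actual values):

    # Assumed shape of FilterLFNreference_records (values illustrative)
    FilterLFNreference_records = [
        {'filter_id': 2, 'filter_name': 'lfn_variant'},
        {'filter_id': 4, 'filter_name': 'lfn_sample_replicate'},
    ]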
Example 6
    def process_blast_result(blast_output_tsv):
        """Reads blast_output_tsv and creates a DF that is compatible to the following taxassign. If this DF is empty, vtam will exit with a warning

        """

        Logger.instance().debug(
            "file: {}; line: {}; Reading Blast output from: {}".format(
                __file__,
                inspect.currentframe().f_lineno, blast_output_tsv))
        blast_output_df = pandas.read_csv(blast_output_tsv,
                                          sep='\t',
                                          header=None,
                                          names=[
                                              'variant_id', 'target_id',
                                              'identity', 'evalue', 'coverage',
                                              'target_tax_id'
                                          ])
        # Remove null target tax ids
        blast_output_df = blast_output_df.loc[~blast_output_df.target_tax_id.
                                              isnull()]

        # expand multiple target_tax_ids
        # first convert to string
        blast_output_df.target_tax_id = blast_output_df.target_tax_id.astype(
            'str')
        # split by ';' and keep only the first target_tax_id
        blast_output_df.target_tax_id = blast_output_df.target_tax_id.str.split(
            pat=';', n=1, expand=True)[0]
        # Convert back to numeric/int
        blast_output_df.target_tax_id = blast_output_df.target_tax_id.astype(
            'float').astype('int')
        # Blast output extract
        """   variant_id  target_id  identity        evalue  coverage  target_tax_id
0           2  MF7836761    99.429  1.620000e-86       100        1469487
1           2  MF7836761    99.429  1.620000e-86       100         189839
2           2  KY2618191    98.857  7.520000e-85       100         189839
3           2  MF7834791    98.857  7.520000e-85       100         189839
4           2  KU9559321    98.857  7.520000e-85       100         189839
"""

        if blast_output_df.shape[0] == 0:
            Logger.instance().warning(
                VTAMexception("Blast did not find any target. "
                              "VTAM will stop here."))
            sys.exit(0)
        return blast_output_df
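The ';' handling above matters because BLAST can report several tax ids in the staxids column. A standalone illustration of keeping only the first one (toy values):

    import pandas

    s = pandas.Series(['1469487;189839', '189839'])
    # Column 0 of the expanded split holds the part before the first ';'
    first = s.str.split(pat=';', n=1, expand=True)[0].astype('float').astype('int')
    # first.tolist() == [1469487, 189839]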
Example 7
    def variant_sequence_to_id(self):

        variant_id_lst = []
        with self.engine.connect() as conn:
            for sequence in self.id_name_or_sequence_list:
                result = conn.execute(
                    sqlalchemy.select([Variant.__table__.c.id]).where(
                        Variant.__table__.c.sequence == sequence)).first()
                if result is None:
                    Logger.instance().error(
                        "Sequence {} not found in table {}".format(
                            sequence, str(Variant.__table__)))
                    sys.exit(1)
                variant_id_lst.append(result[0])
        return variant_id_lst
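Together with Example 1, this gives the typical call chain (a sketch; the constructor signature is inferred from Example 1, the sequences are placeholders):

    # engine: an SQLAlchemy engine, as in Example 5
    # Map variant sequences to their database ids; exits via sys.exit(1)
    # if a sequence is not in the Variant table
    variant_ids = NameIdConverter(['ACGTACGT', 'TTTTAAAA'], engine).variant_sequence_to_id()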
Example 8
    def run(self):
        """Run the vsearch

        :return: void
        """
        cmd = self.create_command()

        if sys.platform.startswith("win"):
            args = cmd
        else:
            args = shlex.split(cmd)
        run_result = subprocess.run(args=args, capture_output=True)

        Logger.instance().info(run_result.stdout.decode())
        Logger.instance().info(run_result.stderr.decode())
Example 9
    def create_command(self):
        """Create the vsearch command that will be run_name

        :return: void
        """

        command = 'vsearch'
        for param in self.parameters:
            if self.parameters[param] is not None:
                command += ' --{} {}'.format(param, self.parameters[param])
            else:
                command += ' --{}'.format(param)
        Logger.instance().debug(command)

        return command
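For instance, given the following parameters dict, the loop above renders the command shown in the final comment (the option names are real vsearch flags, but the dict itself is an illustration):

    parameters = {'uchime3_denovo': 'variants.fasta',
                  'nonchimeras': 'out.fasta',
                  'sizein': None}
    command = 'vsearch'
    for param in parameters:
        if parameters[param] is not None:
            command += ' --{} {}'.format(param, parameters[param])
        else:
            command += ' --{}'.format(param)  # flag without a value
    # command == 'vsearch --uchime3_denovo variants.fasta --nonchimeras out.fasta --sizein'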
Example 10
    def to_names(self, declarative_model):

        nameid_lst = []
        with self.engine.connect() as conn:
            for idx in self.id_name_or_sequence_list:
                result = conn.execute(
                    sqlalchemy.select([
                        declarative_model.__table__.c.name
                    ]).where(declarative_model.__table__.c.id == idx)).first()
                if result is None:
                    Logger.instance().error(
                        "Id {} not found in table {}".format(
                            idx, str(declarative_model.__table__)))
                    sys.exit(1)
                nameid_lst.append(result[0])
        return nameid_lst
Example 11
    def variant_id_is_chimera_borderline(self):

        chimera_borderline_lst = []
        with self.engine.connect() as conn:
            for variant_id in self.id_name_or_sequence_list:
                result = conn.execute(
                    sqlalchemy.select([
                        FilterChimeraBorderline.__table__.c.filter_delete
                    ]).where(FilterChimeraBorderline.__table__.c.variant_id ==
                             variant_id).distinct()).first()
                if result is None:
                    Logger.instance().error(
                        "Variant ID {} not found in table FilterChimeraBorderline"
                        .format(variant_id))
                    sys.exit(1)
                chimera_borderline_lst.append(result[0])
        return chimera_borderline_lst
Example 12
    def __init__(self, asv_table_df, readcounts, run_marker_df=None):
        """
        Constructor of the CommandPoolRunMarkers class

        Parameters
        ----------
        asv_table_df : pandas dataframe
            ASV table.
        readcounts : bool
            Default False.
            If False, a boolean 0/1 indicates presence or absence of the variant in the pooled table.
            If True, an integer gives the sum of reads over the pooled runs or markers.
        run_marker_df : pandas dataframe
            Optional run_name/marker_name pairs restricting which runs and markers are pooled; by default all are pooled
        """

        header = {
            'run_name', 'marker_name', 'variant_id', 'sequence_length',
            'read_count'
        }
        if not set(asv_table_df.columns
                   ) >= header:  # must contain at least the 'header' columns
            Logger.instance().error(
                VTAMexception(
                    "The ASV table structure is wrong. It is expected to contain these columns: "
                    "run_name, marker_name, variant_id, sequence_length, read_count"
                ))
            sys.exit(1)

        self.sample_names = asv_table_df.columns.tolist()[5:-2]

        if run_marker_df is None:  # Default: pool all marker_name
            self.asv_table_df = asv_table_df
        else:  # if run_marker_df: pool only markers in this variant_read_count_input_df
            self.asv_table_df = asv_table_df.merge(
                run_marker_df, on=['run_name', 'marker_name'])

        self.tmp_dir = os.path.join(PathManager.instance().get_tempdir(),
                                    os.path.basename(__file__))
        pathlib.Path(self.tmp_dir).mkdir(exist_ok=True)

        self.cluster_path = None  # returned by run_vsearch_to_cluster_sequences

        self.cluster_df = None  # returned by get_vsearch_clusters_to_df
        self.readcounts = readcounts  # returned by get_vsearch_clusters_to_df
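A sketch of how run_marker_df restricts pooling (the column names come from the constructor above; the values and the constructor call are illustrative):

    import pandas

    # Pool only the MFZR and ZFZR markers of run1
    run_marker_df = pandas.DataFrame({'run_name': ['run1', 'run1'],
                                      'marker_name': ['MFZR', 'ZFZR']})
    # pool_obj = CommandPoolRunMarkers(asv_table_df, readcounts=False,
    #                                  run_marker_df=run_marker_df)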
Example 13
    def __download_ncbi_taxonomy_dump(self):
        # Download files
        remotefile = "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz"
        new_taxdump_path = os.path.join(self.tempdir,
                                        os.path.basename(remotefile))
        Logger.instance().debug(
            "file: {}; line: {}; Downloading NCBI taxonomy dump".format(
                __file__,
                inspect.currentframe().f_lineno))
        if not os.path.isfile(new_taxdump_path):
            Logger.instance().info("Downloading NCBI taxonomy dump")
            with tqdm(...) as t:
                t.set_description(os.path.basename(new_taxdump_path))
                urllib.request.urlretrieve(remotefile,
                                           new_taxdump_path,
                                           reporthook=tqdm_hook(t))
        return new_taxdump_path
Example 14
    def get_one_tax_id_lineage(self, tax_id):
        """
        Takes a tax_id and creates a dictionary with the taxonomy lineage in
        this form: {'species': 183142, 'genus': 10194, 'family': 10193,
        'order': 84394, 'superorder': 1709201, 'class': 10191, 'phylum': 10190,
        'no rank': 131567, 'kingdom': 33208, 'superkingdom': 2759}

        Parameters
        ----------
        tax_id : int
            NCBI taxon id

        Returns
        -------
        dict
            Dictionary with the taxonomy lineage for the given tax_id

        """

        lineage_dic = {}
        while tax_id != 1:
            # tax_id is found as normal tax in the taxonomy file
            if tax_id in self.df.index:
                tax_id_row = self.df.loc[tax_id, ]
            # tax_id is found as old_tax_id column in the taxonomy file
            elif tax_id in self.old_tax_df.index.tolist():  # Try old tax id
                tax_id_new = self.old_tax_df.loc[tax_id, 'tax_id']
                tax_id_row = self.df.loc[tax_id_new, ]
            # tax_id is not found in the taxonomy file.
            # Return current lineage dic and exit the function
            else:
                Logger.instance().warning(
                    "The taxon ID {} in the Blast database is missing in the taxonomy.tsv. "
                    "Consider updating this file with the following command: vtam taxonomy --output taxonomy.tsv."
                    .format(tax_id))
                return lineage_dic
            rank = tax_id_row['rank']
            parent_tax_id = tax_id_row['parent_tax_id']
            lineage_dic[rank] = tax_id
            tax_id = parent_tax_id
        return lineage_dic
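A self-contained illustration of the lineage walk, with a three-row toy taxonomy (the tax ids below are invented, and self.df/self.old_tax_df are replaced by a single DataFrame):

    import pandas

    # Toy taxonomy: each row maps a tax_id (index) to its parent and rank
    df = pandas.DataFrame(
        {'parent_tax_id': [1, 2, 33208],
         'rank': ['superkingdom', 'kingdom', 'species']},
        index=[2, 33208, 183142])
    tax_id = 183142
    lineage_dic = {}
    while tax_id != 1:  # walk up until the root
        tax_id_row = df.loc[tax_id, ]
        lineage_dic[tax_id_row['rank']] = tax_id
        tax_id = tax_id_row['parent_tax_id']
    # lineage_dic == {'species': 183142, 'kingdom': 33208, 'superkingdom': 2}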
Example 15
    def get_several_tax_id_lineages(self, tax_id_list):
        """
        Takes a list of tax_ids and creates a DataFrame with the taxonomy lineages in columns
        and the tax_id as index

tax_id (index)  no rank    species     genus     family     order     class
1246992   131567   741276.0    5533.0  1799696.0  231213.0  162481.0    29000.0
1112827   131567  1112827.0    6220.0   941271.0    6219.0    6218.0

        Parameters
        ----------
        tax_id_list : list
            List of NCBI taxon ids

        Returns
        -------
        DataFrame
        DataFrame with lineages in columns and tax_id as index

        """

        lineage_list = []
        for target_tax_id_i, target_tax_id in enumerate(tax_id_list):
            if target_tax_id_i % 100 == 0:
                Logger.instance().debug(
                    "Get lineage of {}-th tax id {} (Total {} tax ids)".format(
                        target_tax_id_i, target_tax_id, len(tax_id_list)))
            lineage_list.append({
                'tax_id': target_tax_id,
                **self.get_one_tax_id_lineage(tax_id=target_tax_id)
            })
        tax_id_lineage_df = pandas.DataFrame(lineage_list)
        tax_id_lineage_df.set_index('tax_id',
                                    drop=True,
                                    inplace=True,
                                    verify_integrity=True)
        return tax_id_lineage_df
Example 16
    def run_local_blast(self):
        """Runs a local blast and returns the path to the output TSV file"""

        #######################################################################
        #
        # Run local blast
        #
        #######################################################################

        Logger.instance().debug(
            "file: {}; line: {}; Running local blast with FASTA input {}".
            format(__file__,
                   inspect.currentframe().f_lineno, self.variant_fasta))

        # Run and read local blast result
        blast_output_tsv = os.path.join(self.this_temp_dir, 'blast_output.tsv')
        # blast_output_tsv = "/home/gonzalez/tmp/blast/blast_output.tsv" # uncomment for testing
        # get blast db dir and filename prefix from NHR file
        os.environ['BLASTDB'] = self.blast_db_dir

        blastn_cline = NcbiblastnCommandline(
            query=self.variant_fasta,
            db=self.blast_db_name,
            evalue=1e-5,
            outfmt='"6 qseqid sacc pident evalue qcovhsp staxids"',
            dust='yes',
            qcov_hsp_perc=self.qcov_hsp_perc,
            num_threads=self.num_threads,
            out=blast_output_tsv)
        Logger.instance().debug("file: {}; line: {}; {}".format(
            __file__,
            inspect.currentframe().f_lineno, str(blastn_cline)))
        #
        # Run blast
        stdout, stderr = blastn_cline()
        return blast_output_tsv
Example 17
    def main(fastainfo,
             fastadir,
             sorteddir,
             params=None,
             num_threads=multiprocessing.cpu_count()):

        if sys.platform.startswith('win'):
            num_threads = 1

        ############################################################################################
        #
        # params.yml parameters
        #
        ############################################################################################

        params_dic = FileParams(params).get_params_dic()

        cutadapt_error_rate = params_dic['cutadapt_error_rate']
        cutadapt_minimum_length = params_dic['cutadapt_minimum_length']
        cutadapt_maximum_length = params_dic['cutadapt_maximum_length']

        ############################################################################################
        #
        # Loop over tag and primer pairs to demultiplex and trim reads
        #
        ############################################################################################

        merged_fastainfo_df = FileSampleInformation(
            fastainfo).read_tsv_into_df()

        pathlib.Path(sorteddir).mkdir(parents=True, exist_ok=True)
        tempdir = PathManager.instance().get_tempdir()

        sorted_read_info_df = pandas.DataFrame()

        for i in range(0, merged_fastainfo_df.shape[0]):
            fasta_info_series = merged_fastainfo_df.iloc[i]

            tag_fwd = fasta_info_series.tagfwd
            tag_rev = fasta_info_series.tagrev
            primer_fwd = fasta_info_series.primerfwd
            primer_rev = fasta_info_series.primerrev
            in_fasta_basename = fasta_info_series.mergedfasta

            Logger.instance().debug(
                "Analysing FASTA file: {}".format(in_fasta_basename))

            fasta_info_df_i = fasta_info_series.to_frame().T
            in_raw_fasta_path = os.path.join(fastadir, in_fasta_basename)

            ########################################################################################
            #
            # Cut adapt tag of forward reads
            # cutadapt --cores=8 --no-indels --error-rate 0 --trimmed-only
            # --front 'tcgatcacgatgt;min_overlap=13...gctgtagatcgaca;min_overlap=14'
            # --output /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_sorted_000.fasta
            # out/control_mfzr/merged/MFZR1_S4_L001_R1_001_merged.fasta
            #
            ########################################################################################

            if generic_dna:  # Biopython <1.78
                tag_rev_rc = str(
                    Seq(tag_rev, generic_dna).reverse_complement())
            else:  # Biopython >=1.78
                tag_rev_rc = str(Seq(tag_rev).reverse_complement())

            out_fasta_basename = os.path.basename(in_raw_fasta_path).replace(
                '.fasta', '_sorted_%03d.fasta' % i)
            out_fasta_path = os.path.join(tempdir, out_fasta_basename)

            cmd_cutadapt_tag_dic = {
                'tag_fwd': tag_fwd,
                'tag_fwd_len': len(tag_fwd),
                'tag_rev_rc': tag_rev_rc,
                'tag_rev_rc_len': len(tag_rev_rc),
                'in_fasta_path': in_raw_fasta_path,
                'out_fasta': out_fasta_path,
                'num_threads': num_threads,
            }

            cmd_cutadapt_tag_str = 'cutadapt --cores={num_threads} --no-indels --error-rate 0 --trimmed-only ' \
                '--front "{tag_fwd};min_overlap={tag_fwd_len}...{tag_rev_rc};min_overlap={tag_rev_rc_len}" ' \
                '--output {out_fasta} {in_fasta_path}'.format(**cmd_cutadapt_tag_dic)

            Logger.instance().debug("Running: {}".format(cmd_cutadapt_tag_str))

            if sys.platform.startswith("win"):
                args = cmd_cutadapt_tag_str
            else:
                args = shlex.split(cmd_cutadapt_tag_str)
            run_result = subprocess.run(args=args,
                                        capture_output=True,
                                        check=True)

            Logger.instance().info(run_result.stdout.decode())
            Logger.instance().info(run_result.stderr.decode())

            ########################################################################################
            #
            # Trim primers from output
            # cutadapt --cores=8 --no-indels --error-rate 0.1 --minimum-length 50 --maximum-length 500 --trimmed-only
            # --front 'TCCACTAATCACAARGATATTGGTAC;min_overlap=26...GGAGGATTTGGWAATTGATTAGTW;min_overlap=24'
            # --output /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_sorted_trimmed_000.fasta
            # /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_sorted_000.fasta
            #
            ########################################################################################

            if generic_dna:  # Biopython <1.78
                primer_rev_rc = str(
                    Seq(primer_rev, generic_dna).reverse_complement())
            else:  # Biopython >=1.78
                primer_rev_rc = str(Seq(primer_rev).reverse_complement())

            in_fasta_path = out_fasta_path
            out_fasta_basename = os.path.basename(in_fasta_path).replace(
                '_sorted_%03d.fasta' % i, '_sorted_trimmed_%03d.fasta' % i)
            out_fasta_path = os.path.join(tempdir, out_fasta_basename)

            cmd_cutadapt_primer_dic = {
                'primer_fwd': primer_fwd,
                'primer_fwd_len': len(primer_fwd),
                'primer_rev_rc': primer_rev_rc,
                'primer_rev_rc_len': len(primer_rev_rc),
                'in_fasta_path': in_fasta_path,
                'out_fasta': out_fasta_path,
                'error_rate': cutadapt_error_rate,
                'read_min_length': cutadapt_minimum_length,
                'read_max_length': cutadapt_maximum_length,
                'num_threads': num_threads,
            }

            cmd_cutadapt_primer_str = 'cutadapt --cores={num_threads} --no-indels --error-rate {error_rate} ' \
                                      '--minimum-length {read_min_length} ' \
                                      '--maximum-length {read_max_length} --trimmed-only  ' \
                                      '--front "{primer_fwd};min_overlap={primer_fwd_len}...{primer_rev_rc};min_overlap={primer_rev_rc_len}" '  \
                '--output {out_fasta} {in_fasta_path}'.format(**cmd_cutadapt_primer_dic)

            Logger.instance().debug(
                "Running: {}".format(cmd_cutadapt_primer_str))

            if sys.platform.startswith("win"):
                args = cmd_cutadapt_primer_str
            else:
                args = shlex.split(cmd_cutadapt_primer_str)
            run_result = subprocess.run(args=args, capture_output=True)

            Logger.instance().info(run_result.stdout.decode())
            Logger.instance().info(run_result.stderr.decode())

            ########################################################################################
            #
            # Cut adapt tag of reverse-complement reads
            # cutadapt --cores=8 --no-indels --error-rate 0 --trimmed-only
            # --front 'tgtcgatctacagc;min_overlap=14...acatcgtgatcga;min_overlap=13'
            # --output /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_rc_sorted_000.fasta
            # out/control_mfzr/merged/MFZR1_S4_L001_R1_001_merged.fasta
            #
            ########################################################################################

            if generic_dna:  # Biopython <1.78
                tag_fwd_rc = str(
                    Seq(tag_fwd, generic_dna).reverse_complement())
            else:  # Biopython >=1.78
                tag_fwd_rc = str(Seq(tag_fwd).reverse_complement())

            out_rc_fasta_basename = os.path.basename(
                in_raw_fasta_path).replace('.fasta',
                                           '_rc_sorted_%03d.fasta' % i)
            out_rc_fasta_path = os.path.join(tempdir, out_rc_fasta_basename)

            cmd_cutadapt_tag_dic = {
                'tag_fwd': tag_rev,
                'tag_fwd_len': len(tag_rev),
                'tag_rev_rc': tag_fwd_rc,
                'tag_rev_rc_len': len(tag_fwd_rc),
                'in_fasta_path': in_raw_fasta_path,
                'out_fasta': out_rc_fasta_path,
                'num_threads': num_threads,
            }

            cmd_cutadapt_tag_str = 'cutadapt --cores={num_threads} --no-indels --error-rate 0 --trimmed-only ' \
                '--front "{tag_fwd};min_overlap={tag_fwd_len}...{tag_rev_rc};min_overlap={tag_rev_rc_len}" ' \
                '--output {out_fasta} {in_fasta_path}'.format(**cmd_cutadapt_tag_dic)

            Logger.instance().debug("Running: {}".format(cmd_cutadapt_tag_str))

            if sys.platform.startswith("win"):
                args = cmd_cutadapt_tag_str
            else:
                args = shlex.split(cmd_cutadapt_tag_str)
            run_result = subprocess.run(args=args, capture_output=True)

            Logger.instance().info(run_result.stdout.decode())
            Logger.instance().info(run_result.stderr.decode())

            ###################################################################
            #
            # Trim primers from output
            # cutadapt --cores=8 --no-indels --error-rate 0.1 --minimum-length 50 --maximum-length 500 --trimmed-only
            # --front 'WACTAATCAATTWCCAAATCCTCC;min_overlap=24...GTACCAATATCYTTGTGATTAGTGGA;min_overlap=26'
            # --output /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_rc_sorted_trimmed_000.fasta
            # /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_rc_sorted_000.fasta
            #
            ###################################################################

            if generic_dna:  # Biopython <1.78
                primer_fwd_rc = str(
                    Seq(primer_fwd, generic_dna).reverse_complement())
            else:  # Biopython >=1.78
                primer_fwd_rc = str(Seq(primer_fwd).reverse_complement())

            in_fasta_path = out_rc_fasta_path
            out_rc_fasta_basename = os.path.basename(in_fasta_path).replace(
                '_rc_sorted_%03d.fasta' % i,
                '_rc_sorted_trimmed_%03d.fasta' % i)
            out_rc_fasta_path = os.path.join(tempdir, out_rc_fasta_basename)

            cmd_cutadapt_primer_dic = {
                'primer_fwd': primer_rev,
                'primer_fwd_len': len(primer_rev),
                'primer_rev_rc': primer_fwd_rc,
                'primer_rev_rc_len': len(primer_fwd_rc),
                'in_fasta_path': in_fasta_path,
                'out_fasta': out_rc_fasta_path,
                'error_rate': cutadapt_error_rate,
                'read_min_length': cutadapt_minimum_length,
                'read_max_length': cutadapt_maximum_length,
                'num_threads': num_threads,
            }
            cmd_cutadapt_primer_str = 'cutadapt --cores={num_threads} --no-indels --error-rate {error_rate} ' \
                '--minimum-length {read_min_length} ' \
                '--maximum-length {read_max_length} --trimmed-only  ' \
                '--front "{primer_fwd};min_overlap={primer_fwd_len}...{primer_rev_rc};min_overlap={primer_rev_rc_len}" ' \
                '--output {out_fasta} {in_fasta_path}'.format(**cmd_cutadapt_primer_dic)

            Logger.instance().debug(
                "Running: {}".format(cmd_cutadapt_primer_str))

            if sys.platform.startswith("win"):
                args = cmd_cutadapt_primer_str
            else:
                args = shlex.split(cmd_cutadapt_primer_str)
            run_result = subprocess.run(args=args, capture_output=True)

            Logger.instance().info(run_result.stdout.decode())
            Logger.instance().info(run_result.stderr.decode())

            ###################################################################
            #
            # Reverse complement back rc fasta and pool
            #
            ###################################################################

            out_final_fasta_basename = os.path.basename(
                in_raw_fasta_path).replace('.fasta', '_%03d.fasta' % i)
            out_final_fasta_path = os.path.join(sorteddir,
                                                out_final_fasta_basename)
            shutil.copy(out_fasta_path, out_final_fasta_path)

            Logger.instance().debug("Pooling fwd and rc reads...")
            with open(out_final_fasta_path, 'a') as fout:
                with open(out_rc_fasta_path, 'r') as fin:
                    for line in fin:
                        if not line.startswith('>'):

                            if generic_dna:  # Biopython <1.78
                                fout.write("%s\n" % str(
                                    Seq(line.strip(),
                                        generic_dna).reverse_complement()))
                            else:  # Biopython >=1.78
                                fout.write("%s\n" % str(
                                    Seq(line.strip()).reverse_complement()))

                        else:
                            fout.write(line)

            fasta_info_df_i = fasta_info_df_i[[
                'run', 'marker', 'sample', 'replicate'
            ]]
            fasta_info_df_i['sortedfasta'] = out_final_fasta_basename
            sorted_read_info_df = pandas.concat(
                [sorted_read_info_df, fasta_info_df_i], axis=0)

        fasta_trimmed_info_tsv = os.path.join(sorteddir, 'sortedinfo.tsv')
        sorted_read_info_df.to_csv(fasta_trimmed_info_tsv,
                                   sep="\t",
                                   header=True,
                                   index=False)
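The Biopython version branch is repeated above for every tag and primer; a small helper would factor it out (a sketch, not part of the original code; generic_dna is the alphabet object that Biopython removed in 1.78):

    def reverse_complement(sequence):
        """Reverse-complements a DNA string across Biopython versions."""
        if generic_dna:  # Biopython <1.78
            return str(Seq(sequence, generic_dna).reverse_complement())
        return str(Seq(sequence).reverse_complement())  # Biopython >=1.78

    # e.g. tag_rev_rc = reverse_complement(tag_rev)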
Example 18
    def run(self):
        session = self.session
        engine = session._session().get_bind()

        ############################################################################################
        #
        # Wrapper inputs, outputs and parameters
        #
        ############################################################################################

        # Input file
        fasta_info_tsv = self.input_file(
            FilterRenkonen.__input_file_sortedinfo)
        #
        # Input table models
        input_filter_chimera_model = self.input_table(
            FilterRenkonen.__input_table_chimera)
        #
        # Options
        renkonen_distance_quantile = float(
            self.option("renkonen_distance_quantile"))
        #
        # Output table models
        output_filter_renkonen_model = self.output_table(
            FilterRenkonen.__output_table_filter_renkonen)

        ############################################################################################
        #
        # 1. Read sortedinfo to get run_id, marker_id, sample_id, replicate for current analysis
        # 2. Delete marker_name/run_name/sample/replicate from variant_read_count_model
        # 3. Get nijk_df input
        #
        ############################################################################################

        sample_info_tsv_obj = FileSampleInformation(tsv_path=fasta_info_tsv)

        sample_info_tsv_obj.delete_from_db(
            engine=engine,
            variant_read_count_like_model=output_filter_renkonen_model)

        variant_read_count_df = sample_info_tsv_obj.get_nijk_df(
            variant_read_count_like_model=input_filter_chimera_model,
            engine=engine,
            filter_id=None)

        ############################################################################################
        #
        # Run per run_id, marker_id
        #
        ############################################################################################

        variant_read_count_delete_df = pandas.DataFrame()
        run_marker_df = variant_read_count_df[['run_id',
                                               'marker_id']].drop_duplicates()

        for row in run_marker_df.itertuples():
            run_id = row.run_id
            marker_id = row.marker_id

            variant_read_count_per_run_marker_df = variant_read_count_df.loc[
                (variant_read_count_df.run_id == run_id)
                & (variant_read_count_df.marker_id == marker_id)]

            if variant_read_count_per_run_marker_df.replicate.unique(
            ).shape[0] > 1:  # if more than one replicate
                filter_renkonen_runner_obj = RunnerFilterRenkonen(
                    variant_read_count_per_run_marker_df)
                filter_output_i_df = filter_renkonen_runner_obj.get_variant_read_count_delete_df(
                    renkonen_distance_quantile)
            else:  # Just one replicate: nothing to compare, keep all variants
                filter_output_i_df = variant_read_count_per_run_marker_df.copy()
                filter_output_i_df['filter_delete'] = False

            variant_read_count_delete_df = pandas.concat(
                [variant_read_count_delete_df, filter_output_i_df], axis=0)

        ############################################################################################
        #
        # 5. Write to DB
        # 6. Touch output tables, to update modification date
        # 7. Exit vtam if all variants are deleted
        #
        ############################################################################################

        DataframeVariantReadCountLike(variant_read_count_delete_df).to_sql(
            engine=engine,
            variant_read_count_like_model=output_filter_renkonen_model)

        for output_table_i in self.specify_output_table():
            declarative_meta_i = self.output_table(output_table_i)
            obj = session.query(declarative_meta_i).order_by(
                declarative_meta_i.id.desc()).first()
            session.query(declarative_meta_i).filter_by(id=obj.id).update(
                {'id': obj.id})
            session.commit()

        if variant_read_count_delete_df.filter_delete.sum(
        ) == variant_read_count_delete_df.shape[0]:
            Logger.instance().warning(
                VTAMexception("This filter has deleted all the variants: {}. "
                              "The analysis will stop here.".format(
                                  self.__class__.__name__)))
            sys.exit(0)
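RunnerFilterRenkonen is not shown in these excerpts. For orientation, the Renkonen distance between two replicates is one minus the sum, over shared variants, of the minimum read proportions; a standalone sketch using the textbook definition (assumed, not taken from VTAM's implementation):

    def renkonen_distance(counts_a, counts_b):
        """Renkonen distance between two {variant_id: read_count} profiles."""
        total_a = sum(counts_a.values())
        total_b = sum(counts_b.values())
        # Renkonen similarity: sum of per-variant minimum read proportions
        similarity = sum(
            min(counts_a[v] / total_a, counts_b[v] / total_b)
            for v in set(counts_a) & set(counts_b))
        return 1.0 - similarity

    # renkonen_distance({1: 90, 2: 10}, {1: 50, 2: 50}) == 0.4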
Example 19
    def get_nijk_df(self,
                    variant_read_count_like_model,
                    engine,
                    filter_id=None):
        """Based on the SortedReadFile samples and the variant_read_count_model, returns the variant_read_count_input_df

        :param variant_read_count_like_model: SQLalchemy models with columns: run_id, marker_id, sample_id, replicate, variant_id, read_count
        :param filter_id:
        :return: DataFrame with columns: run_id, marker_id, sample_id, replicate, variant_id, read_count
        """

        variant_read_count_like_table = variant_read_count_like_model.__table__

        variant_read_count_list = []
        for sample_instance_row in self.to_identifier_df(
                engine=engine).itertuples():
            run_id = sample_instance_row.run_id
            marker_id = sample_instance_row.marker_id
            sample_id = sample_instance_row.sample_id
            replicate = sample_instance_row.replicate
            stmt_select = sqlalchemy.select([
                variant_read_count_like_table.c.run_id,
                variant_read_count_like_table.c.marker_id,
                variant_read_count_like_table.c.sample_id,
                variant_read_count_like_table.c.replicate,
                variant_read_count_like_table.c.variant_id,
                variant_read_count_like_table.c.read_count
            ]).distinct(
            ).where(variant_read_count_like_table.c.run_id == run_id).where(
                variant_read_count_like_table.c.marker_id == marker_id).where(
                    variant_read_count_like_table.c.sample_id == sample_id
                ).where(variant_read_count_like_table.c.replicate == replicate)
            # Used for filters tables where filter_delete attribute exists
            if 'filter_delete' in [
                    column.key
                    for column in variant_read_count_like_table.columns
            ]:
                stmt_select = stmt_select.where(
                    variant_read_count_like_table.c.filter_delete == 0)
            # used for filter lfn where filter_id = 8 is necessary (do not pass
            # all filters)
            if filter_id is not None:
                stmt_select = stmt_select.where(
                    variant_read_count_like_table.c.filter_id == filter_id)
            with engine.connect() as conn2:
                for row in conn2.execute(stmt_select).fetchall():
                    variant_read_count_list.append(row)
        #
        variant_read_count_df = pandas.DataFrame.from_records(
            variant_read_count_list,
            columns=[
                'run_id', 'marker_id', 'sample_id', 'replicate', 'variant_id',
                'read_count'
            ])

        # Exit if no variants are left for analysis
        if variant_read_count_df.shape[0] == 0:
            Logger.instance().warning(
                VTAMexception("No variants available after this filter. "
                              "The pipeline will stop here."))
            sys.exit(0)
        return variant_read_count_df
Example 20
    def run(self):
        session = self.session
        engine = session._session().get_bind()

        #######################################################################
        #
        # Wrapper inputs, outputs and parameters
        #
        #######################################################################

        # Input file
        input_file_sortedinfo = self.input_file(
            VariantReadCount.__input_file_sortedinfo)
        #
        # Input table models
        run_model = self.input_table(VariantReadCount.__input_table_run)
        marker_model = self.input_table(VariantReadCount.__input_table_marker)
        sample_model = self.input_table(VariantReadCount.__input_table_sample)
        #
        # Output
        # Output table
        variant_model = self.output_table(
            VariantReadCount.__output_table_variant)
        variant_read_count_model = self.output_table(
            VariantReadCount.__output_table_variant_read_count)
        # Options
        read_dir = self.option("read_dir")
        global_read_count_cutoff = self.option("global_read_count_cutoff")

        #######################################################################
        #
        # 1. Read sortedinfo to get run_id, marker_id, sample_id, replicate for current analysis
        # 2. Delete marker_name/run_name/sample/replicate from variant_read_count_model
        # 3. Read tsv file with sorted reads
        # 4. Group by read sequence
        # 5. Delete variants if below global_read_count_cutoff
        # 6. Insert into Variant and DataframeVariantReadCountLike tables
        #
        #######################################################################

        #######################################################################
        #
        # 1. Read sample information to get run_id, marker_id, sample_id, replicate for current analysis
        #
        #######################################################################

        Logger.instance().debug(
            "file: {}; line: {}; Read sample information".format(
                __file__,
                inspect.currentframe().f_lineno))
        sortedinfo_df = pandas.read_csv(input_file_sortedinfo,
                                        sep="\t",
                                        header=0)
        sample_instance_list = []
        sortedinfo_df.columns = sortedinfo_df.columns.str.lower()

        for row in sortedinfo_df.itertuples():
            Logger.instance().debug(row)
            marker_name = row.marker
            run_name = row.run
            sample_name = row.sample
            replicate = row.replicate
            with engine.connect() as conn:
                # get run_id ###########
                stmt_select_run_id = select([
                    run_model.__table__.c.id
                ]).where(run_model.__table__.c.name == run_name)
                run_id = conn.execute(stmt_select_run_id).first()[0]
                # get marker_id ###########
                stmt_select_marker_id = select([
                    marker_model.__table__.c.id
                ]).where(marker_model.__table__.c.name == marker_name)
                marker_id = conn.execute(stmt_select_marker_id).first()[0]
                # get sample_id ###########
                stmt_select_sample_id = select([
                    sample_model.__table__.c.id
                ]).where(sample_model.__table__.c.name == sample_name)
                sample_id = conn.execute(stmt_select_sample_id).first()[0]
                # add this sample_instance ###########
                sample_instance_list.append({
                    'run_id': run_id,
                    'marker_id': marker_id,
                    'sample_id': sample_id,
                    'replicate': replicate
                })

        #######################################################################
        #
        # 2. Delete marker_name/run_name/sample/replicate from variant_read_count_model
        #
        #######################################################################

        Logger.instance().debug(
            "file: {}; line: {}; Delete marker_name/run_name/sample/replicate".
            format(__file__,
                   inspect.currentframe().f_lineno))

        with engine.connect() as conn:
            stmt_del = variant_read_count_model.__table__.delete()
            stmt_del = stmt_del.where(variant_read_count_model.__table__.c.
                                      run_id == bindparam('run_id'))
            stmt_del = stmt_del.where(variant_read_count_model.__table__.c.
                                      marker_id == bindparam('marker_id'))
            stmt_del = stmt_del.where(variant_read_count_model.__table__.c.
                                      sample_id == bindparam('sample_id'))
            stmt_del = stmt_del.where(variant_read_count_model.__table__.c.
                                      replicate == bindparam('replicate'))
            conn.execute(stmt_del, sample_instance_list)

        #######################################################################
        #
        # 3. Read tsv file with sorted reads
        #
        #######################################################################

        sample_info_tsv_obj = FileSampleInformation(
            tsv_path=input_file_sortedinfo)
        sample_info_ids_df = sample_info_tsv_obj.to_identifier_df(
            engine=engine)

        Logger.instance().debug(
            "file: {}; line: {}; Read demultiplexed FASTA files".format(
                __file__,
                inspect.currentframe().f_lineno))

        variant_read_count_df = pandas.DataFrame()

        for row in sample_info_ids_df.itertuples():
            run_id = row.run_id
            marker_id = row.marker_id
            sample_id = row.sample_id
            replicate = row.replicate
            read_fasta = row.sortedfasta

            Logger.instance().debug(
                "file: {}; line: {}; Read FASTA: {}".format(
                    __file__,
                    inspect.currentframe().f_lineno, read_fasta))

            read_fasta_path = os.path.join(read_dir, read_fasta)

            if os.path.exists(read_fasta_path):

                ####################################################################################
                #
                # Read FASTA
                #
                ####################################################################################

                sorted_read_list = VariantReadCount.get_sorted_read_list(
                    read_fasta_path, generic_dna)

                variant_read_count_df_sorted_i = pandas.DataFrame({
                    'run_id': [run_id] * len(sorted_read_list),
                    'marker_id': [marker_id] * len(sorted_read_list),
                    'sample_id': [sample_id] * len(sorted_read_list),
                    'replicate': [replicate] * len(sorted_read_list),
                    'read_sequence': sorted_read_list,
                    'read_count': [1] * len(sorted_read_list)
                })
                #  Compute read count
                variant_read_count_df_sorted_i = variant_read_count_df_sorted_i.groupby(
                    [
                        'run_id', 'marker_id', 'sample_id', 'replicate',
                        'read_sequence'
                    ]).sum().reset_index()

                variant_read_count_df = pandas.concat(
                    [variant_read_count_df, variant_read_count_df_sorted_i],
                    axis=0)

            else:
                Logger.instance().warning(
                    'This file {} does not exist'.format(read_fasta_path))

        #######################################################################
        #
        # 4. Group by read sequence to variant_read_count with run_id, marker_name, ...
        #
        #######################################################################

        Logger.instance().debug(
            "file: {}; line: {}; Group by read sequence".format(
                __file__,
                inspect.currentframe().f_lineno))
        variant_read_count_df = variant_read_count_df.groupby(
            ['run_id', 'marker_id', 'sample_id', 'replicate',
             'read_sequence']).sum().reset_index()
        variant_read_count_df.rename(columns={'read_sequence': 'variant_id'},
                                     inplace=True)
        variant_read_count_df = variant_read_count_df.sort_values(
            by=variant_read_count_df.columns.tolist())

        #######################################################################
        #
        # 5. Remove variants with read count across all run_name, markers, samples and replicates lower than
        # global_read_count_cutoff parameter
        #
        #######################################################################

        variant_read_count_like_df_obj = DataframeVariantReadCountLike(
            variant_read_count_df)
        Logger.instance().debug(
            "file: {}; line: {}; Remove variants with global read count lower than parameter 'global_read_count_cutoff'"
            .format(__file__,
                    inspect.currentframe().f_lineno))
        variant_read_count_df = variant_read_count_like_df_obj.filter_out_below_global_read_count_cutoff(
            global_read_count_cutoff=global_read_count_cutoff)
        variant_read_count_df.rename(
            columns={'variant_id': 'variant_sequence'}, inplace=True)

        #######################################################################
        #
        # 6. Insert into Variant and VariantReadCount tables
        #
        #######################################################################

        Logger.instance().debug("file: {}; line: {}; Insert variants".format(
            __file__,
            inspect.currentframe().f_lineno))
        variant_read_count_instance_list = []
        variant_read_count_df.sort_values(by=[
            'variant_sequence', 'run_id', 'marker_id', 'sample_id', 'replicate'
        ],
                                          inplace=True)
        variant_new_set = set()
        variant_new_instance_list = []
        with engine.connect() as conn:
            # Retrieve maximal variant id if possible
            select_variant_id_max = conn.execute(
                sqlalchemy.select([func.max(variant_model.__table__.c.id)
                                   ])).first()[0]
            if select_variant_id_max is None:
                select_variant_id_max = 0  # If no variants, then maximal variant id is 0
            for row in variant_read_count_df.itertuples():
                run_id = row.run_id
                marker_id = row.marker_id
                sample_id = row.sample_id
                replicate = row.replicate
                variant_sequence = row.variant_sequence
                read_count = row.read_count
                select_row = conn.execute(
                    sqlalchemy.select([
                        variant_model.__table__.c.id
                    ]).where(variant_model.__table__.c.sequence ==
                             variant_sequence)).first()
                if select_row is None:  # variant_sequence IS NOT in the database, so will INSERT it
                    if variant_sequence not in variant_new_set:
                        variant_id = select_variant_id_max + \
                            len(variant_new_instance_list) + 1
                        variant_new_set.add(variant_sequence)
                        variant_new_instance_list.append({
                            'id': variant_id,
                            'sequence': variant_sequence
                        })
                else:  # variant_sequence IS in the database
                    variant_id = select_row[0]
                variant_read_count_instance_list.append({
                    'run_id': run_id,
                    'marker_id': marker_id,
                    'variant_id': variant_id,
                    'sample_id': sample_id,
                    'replicate': replicate,
                    'read_count': read_count
                })

        #######################################################################
        #
        # Exit if variant_read_count_instance_list empty
        #
        #######################################################################

        if not variant_read_count_instance_list:
            Logger.instance().warning(
                VTAMexception(
                    "No new variants in these samples. Maybe singletons? "
                    "The analysis will stop here."))
            sys.exit(0)

        #######################################################################
        #
        # Write variant_read_count table
        #
        #######################################################################

        Logger.instance().debug(
            "file: {}; line: {};  Insert variant read count".format(
                __file__,
                inspect.currentframe().f_lineno))

        with engine.connect() as conn:

            # Insert only if there are new variants
            if len(variant_new_instance_list) > 0:
                conn.execute(variant_model.__table__.insert(),
                             variant_new_instance_list)

            # Insert new variant_read_count_instances
            conn.execute(variant_read_count_model.__table__.insert(),
                         variant_read_count_instance_list)

        #######################################################################
        #
        # Touch output tables, to update modification date
        #
        #######################################################################

        for output_table_i in self.specify_output_table():
            declarative_meta_i = self.output_table(output_table_i)
            obj = session.query(declarative_meta_i).order_by(
                declarative_meta_i.id.desc()).first()
            session.query(declarative_meta_i).filter_by(id=obj.id).update(
                {'id': obj.id})
            session.commit()
Example 21
    def get_variant_read_count_delete_df(self, variant_df,
                                         uchime3_denovo_abskew):

        temp_dir = os.path.join(PathManager.instance().get_tempdir(),
                                os.path.basename(__file__))
        pathlib.Path(temp_dir).mkdir(exist_ok=True)

        filter_output_chimera_df = self.variant_read_count_df.copy()
        filter_output_chimera_df['filter_delete'] = False
        #
        filter_output_borderline_df = self.variant_read_count_df.copy()
        filter_output_borderline_df['filter_delete'] = False

        run_marker_sample_df = self.variant_read_count_df[[
            'run_id', 'marker_id', 'sample_id'
        ]].drop_duplicates(inplace=False)
        for row in run_marker_sample_df.itertuples():
            run_id = row.run_id
            marker_id = row.marker_id
            sample_id = row.sample_id

            variant_read_count_df = self.variant_read_count_df.loc[
                (self.variant_read_count_df.run_id == run_id)
                & (self.variant_read_count_df.marker_id == marker_id) &
                (self.variant_read_count_df.sample_id == sample_id)]

            variant_read_count_df_obj = DataframeVariantReadCountLike(
                variant_read_count_df=variant_read_count_df)
            N_i_df = variant_read_count_df_obj.get_N_i_df()

            variant_size_df = variant_df.merge(N_i_df,
                                               left_index=True,
                                               right_on='variant_id')
            variant_size_df = variant_size_df[[
                'variant_id', 'sequence', 'N_i'
            ]]
            variant_size_df.rename(columns={'N_i': 'size'}, inplace=True)
            variant_size_df.set_index('variant_id', inplace=True)

            ###################################################################
            #
            # Sort variants by abundance and write to fasta_path
            #
            ###################################################################

            variant_size_df.sort_values(by='size',
                                        ascending=False,
                                        inplace=True)

            variant_df_utils_obj = DataframeVariant(variant_size_df)

            uchime_fasta_path = os.path.join(
                temp_dir, 'run_{}_marker_{}_sample_{}.fasta'.format(
                    run_id, marker_id, sample_id))
            variant_df_utils_obj.to_fasta(fasta_path=uchime_fasta_path,
                                          add_column="size")

            ###################################################################
            #
            # Run uchime_denovo
            #
            ###################################################################

            uchime_borderline_fasta_path = os.path.join(
                temp_dir, 'run_{}_marker_{}_sample_{}_borderline.fasta'.format(
                    run_id, marker_id, sample_id))
            uchime_nonchimeras_fasta_path = os.path.join(
                temp_dir,
                'run_{}_marker_{}_sample_{}_nonchimeras.fasta'.format(
                    run_id, marker_id, sample_id))
            uchime_chimeras_fasta_path = os.path.join(
                temp_dir, 'run_{}_marker_{}_sample_{}_chimeras.fasta'.format(
                    run_id, marker_id, sample_id))

            #
            # Create object and run vsearch
            vsearch_parameters = {
                'uchime3_denovo': uchime_fasta_path,
                'borderline': uchime_borderline_fasta_path,
                'nonchimeras': uchime_nonchimeras_fasta_path,
                'chimeras': uchime_chimeras_fasta_path,
                'abskew': uchime3_denovo_abskew,
            }
            vsearch_cluster = RunnerVSearch(parameters=vsearch_parameters)
            vsearch_cluster.run()

            ###################################################################
            #
            # 4. Delete variants from replicate/sample if chimeric
            #
            ###################################################################

            Logger.instance().debug(
                "Vsearch uchime chimera fasta_path: {}".format(
                    uchime_chimeras_fasta_path))
            with open(uchime_chimeras_fasta_path, "r") as handle:
                for chimera_seqrecord in SeqIO.parse(handle, "fasta"):
                    variant_id = int(chimera_seqrecord.id.split(';')[0])
                    filter_output_chimera_df.loc[
                        (filter_output_chimera_df['run_id'] == run_id)
                        & (filter_output_chimera_df['marker_id'] == marker_id)
                        & (filter_output_chimera_df['sample_id'] == sample_id)
                        &
                        (filter_output_chimera_df['variant_id'] == variant_id),
                        'filter_delete'] = True

            Logger.instance().debug(
                "Vsearch uchime chimera borderline fasta_path: {}".format(
                    uchime_borderline_fasta_path))
            with open(uchime_borderline_fasta_path, "r") as handle:
                for chimera_seqrecord in SeqIO.parse(handle, "fasta"):
                    variant_id = int(chimera_seqrecord.id.split(';')[0])
                    filter_output_borderline_df.loc[
                        (filter_output_borderline_df['run_id'] == run_id)
                        &
                        (filter_output_borderline_df['marker_id'] == marker_id)
                        &
                        (filter_output_borderline_df['sample_id'] == sample_id)
                        & (filter_output_borderline_df['variant_id'] ==
                           variant_id), 'filter_delete'] = True

        return filter_output_chimera_df, filter_output_borderline_df
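
The vsearch_parameters dict above is handed to RunnerVSearch, which presumably expands each key/value pair into a command-line flag. A sketch of the equivalent call, under that assumption and with an illustrative abskew value (the file names are the ones built in the loop):

    # Sketch only: assumes RunnerVSearch maps each dict key to '--key value'.
    # vsearch --uchime3_denovo run_1_marker_1_sample_1.fasta --abskew 16.0 \
    #     --borderline run_1_marker_1_sample_1_borderline.fasta \
    #     --nonchimeras run_1_marker_1_sample_1_nonchimeras.fasta \
    #     --chimeras run_1_marker_1_sample_1_chimeras.fasta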
Example 22
    def main(fastainfo, fastadir, sorteddir, params=None, num_threads=multiprocessing.cpu_count(), 
        no_reverse=False, tag_to_end=False, primer_to_end=False):
        
        Logger.instance().info(f"OPTIONS:\n no_reverse: {not no_reverse} \n tag_to_end {not tag_to_end} \n primer_to_end {not primer_to_end}")

        if sys.platform.startswith('win'):
            num_threads = 1

        ############################################################################################
        #
        # params.yml parameters
        #
        ############################################################################################

        params_dic = FileParams(params).get_params_dic()

        cutadapt_error_rate = params_dic['cutadapt_error_rate']
        cutadapt_minimum_length = params_dic['cutadapt_minimum_length']
        cutadapt_maximum_length = params_dic['cutadapt_maximum_length']

        ############################################################################################
        #
        # Loop over tag and primer pairs to demultiplex and trim reads
        #
        ############################################################################################

        merged_fastainfo_df = FileSampleInformation(fastainfo).read_tsv_into_df()
        
        pathlib.Path(sorteddir).mkdir(parents=True, exist_ok=True)
        tempdir = PathManager.instance().get_tempdir()

        merged_fasta_list = []
        results_list = []
        sample_info = {}

        # Make sure every FASTA file is analysed only once.
        for i in range(merged_fastainfo_df.shape[0]):
            if merged_fastainfo_df.iloc[i].mergedfasta not in merged_fasta_list:
                merged_fasta_list.append(merged_fastainfo_df.iloc[i].mergedfasta)
            
        for mergedfasta in merged_fasta_list:

            inputFiles = FilesInputCutadapt(fastainfo, mergedfasta, no_reverse, tag_to_end)
            
            tagFile_path = inputFiles.tags_file()
            info = inputFiles.get_df_info()

            for key in info.keys():
                if key in sample_info.keys():
                    sample_info[key] = sample_info[key] + info[key]
                else:
                    sample_info[key] = info[key]

            Logger.instance().debug("Analysing FASTA file: {}".format(mergedfasta))

            in_raw_fasta_path = os.path.join(fastadir, mergedfasta)

            ########################################################################################
            #
            #   cutadapt --cores=0 -e 0 --no-indels --trimmed-only -g tagFile:$tagfile 
            #   --overlap length -o "tagtrimmed.{name}.fasta" in_raw_fasta_path
            #
            ########################################################################################

            base = os.path.basename(in_raw_fasta_path)
            base, base_suffix = base.split('.', 1)
            
            out_fasta_path = os.path.join(tempdir, "sorted") 

            cmd_cutadapt_tag_dic = {
                'in_fasta_path': in_raw_fasta_path,
                'out_fasta': out_fasta_path,
                'num_threads': num_threads,
                'tagFile': tagFile_path,
                'base_suffix': base_suffix,
            }

            cmd_cutadapt_tag_str = 'cutadapt --cores={num_threads} --no-indels --error-rate 0 --trimmed-only ' \
                '-g file:{tagFile} --output {out_fasta}_{{name}}.{base_suffix} {in_fasta_path}' \
                .format(**cmd_cutadapt_tag_dic)
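            # Expanded, the command resembles the following sketch
            # (file names here are hypothetical):
            #   cutadapt --cores=8 --no-indels --error-rate 0 --trimmed-only \
            #       -g file:tags.fasta --output sorted_{name}.fasta merged.fasta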

            Logger.instance().debug("Running: {}".format(cmd_cutadapt_tag_str))

            if sys.platform.startswith("win"):
                args = cmd_cutadapt_tag_str
            else:
                args = shlex.split(cmd_cutadapt_tag_str)
            run_result = subprocess.run(args=args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

            Logger.instance().info(run_result.stdout.decode())

            inputFiles.remove_tags_file()

            ########################################################################################
            #
            # Trim primers from output
            # cutadapt --quiet --cores=0 -e trim_error --no-indels --trimmed-only 
            # --minimum-length minimum_length --maximum-length maximum_length 
            # --output input_path + {name} + suffix outputfile
            #
            ########################################################################################
            
            primers = inputFiles.primers()
            try:
                tags_samples = inputFiles.get_sample_names()
            except Exception as e:
                Logger.instance().error(e)
                return 
            
            for primer in primers:
                
                marker, primerfwd, primerrev, lenprimerfwd, lenprimerrev = primer

                for tag_sample in tags_samples:

                    name, run, marker2, sample, replicate, _, _ = tag_sample
                    
                    if marker not in marker2:
                        continue

                    in_fasta_path = out_fasta_path + "_" + name + "." + base_suffix

                    baseMerge =  mergedfasta.split(".")[0]
                                        
                    outname = run + "_" + marker + "_" + sample + "_" + replicate + "_" + baseMerge + "_trimmed"
                    if name.endswith("_reversed"):
                        outname = outname + "_reversed"
                    out_fasta_path_new = os.path.join(tempdir, outname + "." + base_suffix)

                    results_list.append(out_fasta_path_new)
                    
                    if not "_reversed" in name:
                        if generic_dna:  # Biopython <1.78
                            primerRev = str(Seq(primerrev, generic_dna).reverse_complement())
                        else:  # Biopython =>1.78
                            primerRev = str(Seq(primerrev).reverse_complement())
                        primerFwd = primerfwd
                        lenPrimerFwd = lenprimerfwd
                        lenPrimerRev = lenprimerrev
                    else:
                        if generic_dna:  # Biopython <1.78
                            primerRev = str(Seq(primerfwd, generic_dna).reverse_complement())
                        else:  # Biopython =>1.78
                            primerRev = str(Seq(primerfwd).reverse_complement())
                        primerFwd = primerrev
                        lenPrimerFwd = lenprimerrev
                        lenPrimerRev = lenprimerfwd

                    cmd_cutadapt_primer_dic = {
                        'in_fasta_path': in_fasta_path,
                        'out_fasta': out_fasta_path_new,
                        'error_rate': cutadapt_error_rate,
                        'num_threads': num_threads,
                        'primerFwd': primerFwd,
                        'primerRev': primerRev,
                        'lenPrimerFwd': lenPrimerFwd,
                        'lenPrimerRev': lenPrimerRev,
                        'read_min_length': cutadapt_minimum_length,
                        'read_max_length': cutadapt_maximum_length,
                    }

                    if not primer_to_end:  # anchored adapters: primers must match at the read ends
                        cmd_cutadapt_primer_str = 'cutadapt --cores={num_threads} --no-indels --error-rate {error_rate} ' \
                            '--minimum-length {read_min_length} --maximum-length {read_max_length} ' \
                            '--trimmed-only -g "^{primerFwd}...{primerRev}$" --output {out_fasta} {in_fasta_path}'\
                            .format(**cmd_cutadapt_primer_dic)
                    else:
                        cmd_cutadapt_primer_str = 'cutadapt --cores={num_threads} --no-indels --error-rate {error_rate} ' \
                            '--minimum-length {read_min_length} --maximum-length {read_max_length} ' \
                            '--trimmed-only -g "{primerFwd};min_overlap={lenPrimerFwd}...{primerRev};min_overlap={lenPrimerRev}" '\
                            '--output {out_fasta} {in_fasta_path}'\
                            .format(**cmd_cutadapt_primer_dic)

                    Logger.instance().debug("Running: {}".format(cmd_cutadapt_primer_str))

                    if sys.platform.startswith("win"):
                        args = cmd_cutadapt_primer_str
                    else:
                        args = shlex.split(cmd_cutadapt_primer_str)

                    run_result = subprocess.run(args=args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

                    Logger.instance().info(run_result.stdout.decode())

        ###################################################################
        #
        # Reverse complement back rc fasta and pool
        #
        ###################################################################   
     
        for file in results_list:
            if "_trimmed" in file:

                out_final_fasta_path = os.path.join(sorteddir, os.path.split(file)[-1])
                in_fasta_path = os.path.join(tempdir, file)

                if out_final_fasta_path.endswith(".gz"):      
                    _open = partial(gzip.open) 
                elif out_final_fasta_path.endswith(".bz2"):
                    _open = partial(bz2.open)
                else:
                    _open = open

                if in_fasta_path.endswith(".gz"):
                    _open2 = partial(gzip.open) 
                elif in_fasta_path.endswith(".bz2"):
                    _open2 = partial(bz2.open) 
                else: 
                    _open2 = open

                if "_reversed" in file:
                    Logger.instance().debug("Pooling fwd and rc reads...")

                    out_final_fasta_path = out_final_fasta_path.replace("_reversed", "")

                    with _open(out_final_fasta_path, 'at') as fout:
                        with _open2(in_fasta_path, 'rt') as fin:
                            for line in fin:
                                if not line.startswith('>'):
                                    if generic_dna:  # Biopython <1.78
                                        fout.write("%s\n" % str(
                                            Seq(line.strip(), generic_dna).reverse_complement()))
                                    else:  # Biopython =>1.78
                                        fout.write("%s\n" % str(
                                            Seq(line.strip()).reverse_complement()))

                                else:
                                    fout.write(line)
                else:
                    with _open(out_final_fasta_path, 'at') as fout:
                        with _open2(in_fasta_path, 'rt') as fin:
                            for line in fin:
                                fout.write(line)
        
        results_list = [os.path.split(result)[-1] for result in results_list if "_reversed" not in result]

        del sample_info['mergedfasta']
        del sample_info['primerrev']
        del sample_info['primerfwd']
        del sample_info['tagrev']
        del sample_info['tagfwd']

        sample_info['sortedfasta'] = results_list

        sample_info_df = pandas.DataFrame(sample_info)

        fasta_trimmed_info_tsv = os.path.join(sorteddir, 'sortedinfo.tsv')
        sample_info_df.to_csv(fasta_trimmed_info_tsv, sep="\t", header=True, index=False)
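
The generic_dna branches above keep the code compatible with both old and new Biopython releases. A plausible import guard, assumed to sit at the top of this module (Bio.Alphabet was removed in Biopython 1.78):

    try:
        from Bio.Alphabet import generic_dna  # Biopython <1.78
    except ImportError:
        generic_dna = None  # Biopython >=1.78: Seq takes no alphabet
    from Bio.Seq import Seq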
Example 23
    def main(cls,
             db,
             mode,
             asvtable_tsv,
             output,
             taxonomy_tsv,
             blastdb_dir_path,
             blastdbname_str,
             num_threads=multiprocessing.cpu_count(),
             params=None):
        """

        Parameters
        ----------
        db: str
            Path to SQLITE database with Variant and Taxassign tables
        mode
        asvtable_tsv
        output
        taxonomy_tsv
        blastdb_dir_path
        blastdbname_str
        num_threads
        params

        Returns
        -------

        """

        this_temp_dir = os.path.join(PathManager.instance().get_tempdir(),
                                     os.path.basename(__file__))
        pathlib.Path(this_temp_dir).mkdir(exist_ok=True)

        #######################################################################
        #
        # Parameters
        #
        #######################################################################

        # params_dic = constants.get_params_default_dic()
        # params_dic = FileParams(params).get_params_dic()

        # ltg_rule_threshold = params_dic['ltg_rule_threshold']
        # include_prop = params_dic['include_prop']
        # min_number_of_taxa = params_dic['min_number_of_taxa']
        # qcov_hsp_perc = params_dic['qcov_hsp_perc']

        #######################################################################
        #
        # Load db and tables as classes and delete taxassign in reset mode
        #
        #######################################################################

        engine = sqlalchemy.create_engine('sqlite:///{}'.format(db),
                                          echo=False)

        variant_declarative_table = Variant.__table__
        variant_declarative_table.create(bind=engine, checkfirst=True)
        tax_assign_declarative_table = TaxAssign.__table__
        tax_assign_declarative_table.create(bind=engine, checkfirst=True)

        if mode == 'reset':
            with engine.connect() as conn:
                conn.execute(tax_assign_declarative_table.delete())

        #######################################################################
        #
        # Use variants that are not already assigned in TaxAssign
        #
        #######################################################################

        variant_input_df = pandas.read_csv(asvtable_tsv, sep="\t", header=0)
        # get list of variant sequences
        variant_sequence_list = variant_input_df.sequence.tolist()

        # Add variant to DB if not already there
        for variant_sequence in variant_sequence_list:
            with engine.connect() as conn:
                row_variant = conn.execute(
                    sqlalchemy.select([
                        variant_declarative_table.c.id
                    ]).where(variant_declarative_table.c.sequence ==
                             variant_sequence)).first()
                if row_variant is None:  # variant_sequence IS NOT in the database, so INSERT it
                    conn.execute(variant_declarative_table.insert().values(
                        sequence=variant_sequence))

        #######################################################################
        #
        # Get already tax-assigned variants with all information including sequence
        #
        #######################################################################

        stmt_variant_tax_assign = sqlalchemy.select([
            tax_assign_declarative_table.c.variant_id,
            tax_assign_declarative_table.c.identity,
            tax_assign_declarative_table.c.ltg_rank,
            tax_assign_declarative_table.c.ltg_tax_id,
            tax_assign_declarative_table.c.ltg_tax_name,
            tax_assign_declarative_table.c.blast_db,
            variant_declarative_table.c.sequence,
        ])\
            .where(tax_assign_declarative_table.c.ltg_tax_id.isnot(None))\
            .where(tax_assign_declarative_table.c.variant_id == variant_declarative_table.c.id)\
            .where(variant_declarative_table.c.sequence.in_(variant_sequence_list))\
            .distinct()

        # These are the variants that are already in taxassign and do not need
        # recalculate
        ltg_from_db_list = []
        with engine.connect() as conn:
            for row in conn.execute(stmt_variant_tax_assign).fetchall():
                ltg_from_db_list.append(dict(zip(row.keys(), row.values())))
        """(Pdb) pandas.DataFrame.from_records(ltg_from_db_list)
   identity ltg_rank  ltg_tax_id              ltg_tax_name                                           sequence  variant_id
0       100  species     2028017  Orthocladiinae sp. BAP34  AGCATGATCTGGAATAGTAGGTACTTCCCTTAGTATCTTAATTCGA...         325
1        99  species     2028029   Rheocricotopus sp. DH90  GGCTTGATCCGGAATAGTAGGAACTTCTTTAAGAATTCTAATTCGA...        1203
2       100  species     1592914            Caenis pusilla  GGCTTGATCCGGAATGCTGGGCACCTCTCTAAGCCTTCTAATTCGT...        1443
3       100  species     2028029   Rheocricotopus sp. DH90  TGCTTGATCAGGAATAGTAGGAACTTCTTTAAGAATTCTAATTCGA...        2298
4        90   family        7149              Chironomidae  TGCTTGATCAGGGATAGTGGGAACTTCTTTAAGAATTCTTATTCGA...        2498
5       100  species      189839            Baetis rhodani  TGCTTGGGCAGGTATGGTAGGTACCTCATTAAGACTTTTAATTCGA...        2610"""
        ltg_db_df = pandas.DataFrame.from_records(ltg_from_db_list)
        ltg_db_df = ltg_db_df.reindex(sorted(ltg_db_df.columns),
                                      axis=1)  # sort columns

        #######################################################################
        #
        # Get list of variants (id and sequence) that need blast for tax assignation
        #
        #######################################################################

        stmt_variant = sqlalchemy.select([variant_declarative_table.c.id, variant_declarative_table.c.sequence]) \
            .where(variant_declarative_table.c.sequence.in_(variant_sequence_list)) \

        if ltg_db_df.shape[0] > 0:
            stmt_variant = stmt_variant.where(
                variant_declarative_table.c.id.notin_(
                    ltg_db_df.variant_id.tolist()))
        stmt_variant = stmt_variant.distinct().order_by("id")

        variant_not_tax_assigned = []
        with engine.connect() as conn:
            for row in conn.execute(stmt_variant).fetchall():
                variant_not_tax_assigned.append(
                    dict(zip(row.keys(), row.values())))

        #######################################################################
        #
        # Run RunnerTaxAssign for variant_not_tax_assigned
        #
        #######################################################################

        blast_variant_df = pandas.DataFrame()
        ltg_blast_df = pandas.DataFrame()

        if len(variant_not_tax_assigned) > 0:  # Run blast for variants that need tax assignation

            blast_variant_df = pandas.DataFrame.from_records(
                variant_not_tax_assigned, index='id')
            taxonomy = Taxonomy(tsv=taxonomy_tsv)
            sequence_list = blast_variant_df.sequence.tolist()
            tax_assign_runner = RunnerTaxAssign(sequence_list=sequence_list,
                                                taxonomy=taxonomy,
                                                blast_db_dir=blastdb_dir_path,
                                                blast_db_name=blastdbname_str,
                                                num_threads=num_threads,
                                                params=None)
            ltg_blast_df = tax_assign_runner.ltg_df

            ######################################################
            # Uncomment to debug because blast is slow
            # pandas.to_pickle(ltg_df, "ltg_df.pkl")
            # ltg_df = pandas.read_pickle("ltg_df.pkl")
            # import pdb; pdb.set_trace()
            ######################################################

            ltg_blast_df.rename({'variant_id': 'sequence'},
                                inplace=True,
                                axis=1)

            ltg_blast_df = blast_variant_df.merge(ltg_blast_df,
                                                  on='sequence',
                                                  how='outer')

            ltg_blast_df['blast_db'] = blastdbname_str

            ltg_blast_df = ltg_blast_df.reindex(sorted(ltg_blast_df.columns),
                                                axis=1)  # sort columns
        del blast_variant_df

        #######################################################################
        #
        # Concatenate tax-assigned variants from DB and from Blast
        # Merge variant_df and ltg_df and write to DB
        #
        #######################################################################

        if ltg_db_df.shape[0] > 0 and ltg_blast_df.shape[0] > 0:
            ltg_df = pandas.concat([
                ltg_db_df[[
                    "blast_db", "identity", "ltg_rank", "ltg_tax_id",
                    "ltg_tax_name", "sequence"
                ]], ltg_blast_df
            ],
                                   axis=0)
        elif ltg_db_df.shape[0] > 0:
            ltg_df = ltg_db_df.copy()
        elif ltg_blast_df.shape[0] > 0:
            ltg_df = ltg_blast_df.copy()
        else:  # no assignments from the DB nor from blast
            ltg_df = pandas.DataFrame()
        del ltg_blast_df

        #######################################################################
        #
        # Insert or update variant and taxassign tables
        #
        #######################################################################

        Logger.instance().debug(
            "file: {}; line: {}; Insert variant_id, ltg_tax_id, ltg_rank to DB"
            .format(__file__,
                    inspect.currentframe().f_lineno))

        for ltg_row in ltg_df.itertuples():
            variant_sequence = ltg_row.sequence
            with engine.connect() as conn:
                variant_id = conn.execute(
                    sqlalchemy.select([
                        variant_declarative_table.c.id
                    ]).where(variant_declarative_table.c.sequence ==
                             variant_sequence)).first()[0]
                select_row = conn.execute(
                    sqlalchemy.select([
                        TaxAssign
                    ]).where(tax_assign_declarative_table.c.variant_id ==
                             variant_id)).first()
                if select_row is None:  # variant_id IS NOT in the database, so INSERT it
                    ltg_row_dic = ltg_row._asdict()
                    ltg_row_dic['variant_id'] = variant_id
                    conn.execute(tax_assign_declarative_table.insert(),
                                 dict(ltg_row_dic))
                else:  # variant_id IS in the database, so UPDATE the row
                    ltg_row_dic = ltg_row._asdict()
                    update_values = {
                        k: v
                        for k, v in ltg_row_dic.items()
                        if k in tax_assign_declarative_table.columns.keys()}
                    conn.execute(
                        tax_assign_declarative_table.update().where(
                            tax_assign_declarative_table.c.variant_id ==
                            variant_id).values(**update_values))

        #######################################################################
        #
        # Update LTGs for variant output file
        #
        #######################################################################

        Logger.instance().debug(
            "file: {}; line: {}; Update LTGs for variant output file".format(
                __file__,
                inspect.currentframe().f_lineno))

        variant_output_df = variant_input_df.copy()
        del variant_input_df
        # Add ltg columns to variant_df if they do not exist
        for ltg_df_col in [
                'ltg_tax_id', 'ltg_tax_name', 'ltg_rank', 'identity',
                'blast_db'
        ]:
            if not (ltg_df_col in variant_output_df.columns):
                variant_output_df[ltg_df_col] = None
        # Move sequence column to end
        variant_df_columns = variant_output_df.columns.tolist()
        variant_df_columns.append(
            variant_df_columns.pop(variant_df_columns.index('sequence')))
        variant_output_df = variant_output_df[variant_df_columns]

        for variant_row in variant_output_df.itertuples():
            # variant_id = variant_row.variant_id
            variant_sequence = variant_row.sequence
            with engine.connect() as conn:
                variant_id = conn.execute(
                    sqlalchemy.select([
                        variant_declarative_table.c.id
                    ]).where(variant_declarative_table.c.sequence ==
                             variant_sequence)).first()[0]
                select_row = conn.execute(
                    sqlalchemy.select([
                        TaxAssign.ltg_tax_id,
                        TaxAssign.ltg_tax_name,
                        TaxAssign.ltg_rank,
                        TaxAssign.identity,
                        TaxAssign.blast_db,
                    ]).where(tax_assign_declarative_table.c.variant_id ==
                             variant_id)).first()
            tax_assign_dict = dict(
                zip([
                    'ltg_tax_id', 'ltg_tax_name', 'ltg_rank', 'identity',
                    'blast_db'
                ], select_row))
            for k in tax_assign_dict:
                variant_output_df.loc[variant_output_df.sequence ==
                                      variant_sequence, k] = tax_assign_dict[k]
        # Do not move: required because ltg_tax_id is sometimes None
        variant_output_df = variant_output_df.astype({'ltg_tax_id': 'object'})

        #######################################################################
        #
        # Update tax lineages for variant output file
        #
        #######################################################################

        Logger.instance().debug(
            "file: {}; line: {}; Update tax lineages for variant output file".
            format(__file__,
                   inspect.currentframe().f_lineno))

        tax_id_list = variant_output_df.ltg_tax_id.unique().tolist()  # unique list of tax ids
        tax_lineage = TaxLineage(taxonomic_tsv_path=taxonomy_tsv)
        tax_lineage_df = tax_lineage.create_lineage_from_tax_id_list(
            tax_id_list=tax_id_list, tax_name=True)

        # Merge
        variant_output_df = variant_output_df.merge(tax_lineage_df,
                                                    left_on='ltg_tax_id',
                                                    right_on='tax_id',
                                                    how='left')
        variant_output_df.drop('tax_id', axis=1, inplace=True)

        Logger.instance().debug("file: {}; line: {}; Reorder columns".format(
            __file__,
            inspect.currentframe().f_lineno))
        # Move sequence column to end
        variant_df_columns = variant_output_df.columns.tolist()
        variant_df_columns.append(
            variant_df_columns.pop(variant_df_columns.index('sequence')))
        variant_output_df = variant_output_df[variant_df_columns]
        Logger.instance().debug("file: {}; line: {}; Write to TSV".format(
            __file__,
            inspect.currentframe().f_lineno))
        variant_output_df.to_csv(output, sep='\t', index=False, header=True)
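
A minimal invocation sketch for this method, with hypothetical paths and assuming the enclosing class (called CommandTaxAssign here) exposes it as a classmethod:

    # All paths below are hypothetical placeholders.
    CommandTaxAssign.main(
        db="db.sqlite",               # SQLITE DB with Variant and TaxAssign tables
        mode="reset",                 # 'reset' drops previous TaxAssign rows first
        asvtable_tsv="asvtable.tsv",  # input ASV table with a 'sequence' column
        output="asvtable_taxa.tsv",   # output TSV with LTG and lineage columns
        taxonomy_tsv="taxonomy.tsv",
        blastdb_dir_path="blastdb",
        blastdbname_str="coi_blast_db",
        num_threads=4)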
Example 24
    def create_denovo_from_ncbi(self):
        new_taxdump_path = self.__download_ncbi_taxonomy_dump()
        #
        Logger.instance().debug(
            "file: {}; line: {}; Extracting NCBI taxonomy dump".format(
                __file__,
                inspect.currentframe().f_lineno))
        if not (os.path.isfile(
                os.path.join(os.path.dirname(new_taxdump_path), "nodes.dmp"))
                and os.path.isfile(
                    os.path.join(os.path.dirname(new_taxdump_path),
                                 "names.dmp"))
                and os.path.isfile(
                    os.path.join(os.path.dirname(new_taxdump_path),
                                 "merged.dmp"))):
            tar = tarfile.open(new_taxdump_path, "r:gz")
            tar.extractall(path=self.tempdir)
            tar.close()
        Logger.instance().debug(
            "file: {}; line: {}; Reading and processing NCBI taxonomy dump".
            format(__file__,
                   inspect.currentframe().f_lineno))
        #
        nodes_dmp = os.path.join(self.tempdir, "nodes.dmp")
        nodes_dmp_df = pandas.read_table(
            nodes_dmp,
            header=None,
            sep='\t',
            engine='python',
            usecols=[0, 2, 4],
            names=['tax_id', 'parent_tax_id', 'rank'])
        #
        names_dmp = os.path.join(self.tempdir, "names.dmp")
        names_dmp_df = pandas.read_table(
            names_dmp,
            header=None,
            sep='\t',
            engine='python',
            usecols=[0, 2, 6],
            names=['tax_id', 'name_txt', 'name_class'])
        names_dmp_df = names_dmp_df.loc[names_dmp_df.name_class ==
                                        'scientific name']
        names_dmp_df = names_dmp_df[['tax_id', 'name_txt']]
        #
        taxonomy_df = nodes_dmp_df.merge(names_dmp_df, on='tax_id')
        #
        merged_dmp = os.path.join(self.tempdir, "merged.dmp")
        merged_dmp_df = pandas.read_table(merged_dmp,
                                          header=None,
                                          sep='\t',
                                          engine='python',
                                          usecols=[0, 2],
                                          names=['old_tax_id', 'tax_id'])
        #
        taxonomy_df = taxonomy_df.merge(merged_dmp_df, on='tax_id', how='left')
        #
        Logger.instance().debug("file: {}; line: {}; Write to TSV DB".format(
            __file__,
            inspect.currentframe().f_lineno))
        try:
            taxonomy_df.to_csv(self.taxonomy_tsv_path,
                               sep="\t",
                               header=True,
                               float_format='%.0f',
                               index=False)
        except ValueError as valerr:
            Logger.instance().error(
                VTAMexception(
                    "{}. Error during the creation of the taxonomy DB".format(
                        valerr)))
        except sqlalchemy.exc.OperationalError as opererr:
            Logger.instance().error(
                VTAMexception(
                    "{}. Please, verify the output argument: {}".format(
                        opererr, self.taxonomy_tsv_path)))
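
Once to_csv succeeds, the taxonomy TSV carries the columns produced by the merges above. A quick sanity-check sketch (the path is hypothetical):

    import pandas
    taxonomy_df = pandas.read_csv("taxonomy.tsv", sep="\t", header=0)
    assert {'tax_id', 'parent_tax_id', 'rank', 'name_txt',
            'old_tax_id'} <= set(taxonomy_df.columns)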
Example 25
    def __init__(self, sequence_list, taxonomy, blast_db_dir, blast_db_name,
                 num_threads, params):
        """

        Parameters
        ----------
        sequence_list : list
            List of se
        param2 : str
            The second parameter.

        """

        self.old_tax_id_df = taxonomy.old_tax_df
        self.taxonomy_df = taxonomy.df
        self.blast_db_dir = blast_db_dir
        self.this_temp_dir = os.path.join(PathManager.instance().get_tempdir(),
                                          os.path.basename(__file__))
        pathlib.Path(self.this_temp_dir).mkdir(exist_ok=True)

        self.num_threads = num_threads

        #######################################################################
        #
        # Parameters
        #
        #######################################################################

        params_dic = FileParams(params).get_params_dic()
        qcov_hsp_perc = params_dic['qcov_hsp_perc']

        #######################################################################
        #
        # 2 Create FASTA file with Variants
        #
        #######################################################################

        Logger.instance().debug(
            "file: {}; line: {}; Create SortedReadFile from Variants".format(
                __file__,
                inspect.currentframe().f_lineno))
        variant_fasta = os.path.join(self.this_temp_dir, 'variant.fasta')
        with open(variant_fasta, 'w') as fout:
            for seq in sequence_list:
                fout.write(">{}\n{}\n".format(seq, seq))

        #######################################################################
        #
        # 3 Run local blast
        #
        #######################################################################

        runner_blast = RunnerBlast(variant_fasta, blast_db_dir, blast_db_name,
                                   num_threads, qcov_hsp_perc)
        # run blast
        blast_output_tsv = runner_blast.run_local_blast()
        # process blast results
        blast_output_df = RunnerBlast.process_blast_result(blast_output_tsv)

        #######################################################################
        #
        # Compute tax lineages for Blast target tax ids
        #
        #######################################################################

        Logger.instance().debug(
            "file: {}; line: {}; Open taxonomy.tsv DB".format(
                __file__,
                inspect.currentframe().f_lineno))
        blast_output_df.target_tax_id = pandas.to_numeric(
            blast_output_df.target_tax_id)
        #
        Logger.instance().debug(
            "file: {}; line: {}; Annotate each target_tax_id with its lineage as columns in wide format"
            .format(__file__,
                    inspect.currentframe().f_lineno))
        tax_id_list = blast_output_df.target_tax_id.unique().tolist()
        tax_id_to_lineage_df = taxonomy.get_several_tax_id_lineages(
            tax_id_list)

        #######################################################################
        #
        # Merge tax lineages and the blast result
        #
        #######################################################################

        Logger.instance().debug(
            "file: {}; line: {}; Merge blast result including tax_id with their lineages"
            .format(__file__,
                    inspect.currentframe().f_lineno))
        # Merge local blast output with tax_id_to_lineage_df
        # variant_identity_lineage_df = blast_output_df.merge(
        #     tax_id_to_lineage_df, left_on='target_tax_id', right_on='tax_id')
        variantid_identity_lineage_df = blast_output_df.merge(
            tax_id_to_lineage_df, left_on='target_tax_id', right_index=True)
        # variant_identity_lineage_df.drop('tax_id', axis=1, inplace=True)
        """(Pdb) variant_identity_lineage_df.columns  
Index(['variant_id', 'target_id', 'identity', 'evalue', 'coverage',
       'target_tax_id', 'no rank', 'species', 'genus', 'family', 'order',
       'class', 'subphylum', 'phylum', 'subkingdom', 'kingdom', 'superkingdom',
       'superfamily', 'infraorder', 'suborder', 'infraclass', 'subclass',
       'tribe', 'subfamily', 'cohort', 'subgenus', 'subspecies', 'parvorder',
       'superorder', 'subcohort', 'superclass', 'species group', 'subtribe',
       'section', 'varietas', 'species subgroup'],
      dtype='object')"""

        #######################################################################
        #
        #  several_variants_to_ltg
        # this function returns a data frame containing the Ltg rank and Ltg Tax_id for each variant
        #
        #######################################################################

        Logger.instance().debug(
            "file: {}; line: {}; Main loop over variant and identity to "
            "compute the whole set of ltg_tax_id and ltg_rank for each variant_id "
            "to a dataframe".format(__file__,
                                    inspect.currentframe().f_lineno))
        runner_ltg_selection = RunnerLTGselection(
            variant_identity_lineage_df=variantid_identity_lineage_df,
            taxonomy_df=self.taxonomy_df,
            params=params)
        self.ltg_df = runner_ltg_selection.several_variants_to_ltg()
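
Example 23 above shows the intended call pattern for this class; condensed, with hypothetical inputs:

    # Hypothetical inputs; Taxonomy and RunnerTaxAssign as used in Example 23.
    taxonomy = Taxonomy(tsv="taxonomy.tsv")
    runner = RunnerTaxAssign(sequence_list=["ACGT..."],  # variant sequences
                             taxonomy=taxonomy,
                             blast_db_dir="blastdb",
                             blast_db_name="coi_blast_db",
                             num_threads=4,
                             params=None)
    ltg_df = runner.ltg_df  # one LTG row per variant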
Example 26
    def mark_delete_lfn_per_Ni_or_Nik_or_Njk(
        self,
        lfn_denominator,
        cutoff,
        cutoff_specific_df=None,
    ):
        """

        :param lfn_denominator: string, one of 'N_i', 'N_ik' or 'N_jk'
        :param cutoff: float with the general cutoff
        :param cutoff_specific_df: DataFrame with either variant-specific (N_i) or
            variant-replicate-specific (N_ik) deletion cutoffs
        :return: None. The output of this filter is appended to 'self.variant_read_count_filter_delete_df'
            with filter_id=2 and 'filter_delete'=1 or 0 (general cutoff)
            and with filter_id=4 and 'filter_delete'=1 or 0 (variant-specific cutoff)
        """

        if cutoff_specific_df is not None:
            cutoff_specific_df.drop(['variant_sequence'], axis=1, inplace=True)

        if lfn_denominator == 'N_i':  # variant
            this_filter_id = 2
            N_df = self.variant_read_count_lfn_df.get_N_i_df()  # Compute N_i_df
            filter_df = self.variant_read_count_df.merge(
                N_df, on=['run_id', 'marker_id', 'variant_id'])
            filter_df['filter_id'] = this_filter_id
            filter_df['cutoff'] = cutoff

            filter_cutoff_specific_df = None
            if cutoff_specific_df is not None:
                this_filter_id = 4
                filter_cutoff_specific_df = filter_df.copy()
                filter_cutoff_specific_df.drop('cutoff', axis=1, inplace=True)
                filter_cutoff_specific_df = filter_cutoff_specific_df.merge(
                    cutoff_specific_df,
                    on=['run_id', 'marker_id', 'variant_id'])
                filter_cutoff_specific_df['filter_id'] = this_filter_id

            filter_df = pandas.concat([filter_df, filter_cutoff_specific_df],
                                      axis=0)
            filter_df['lfn_ratio'] = filter_df.read_count / filter_df.N_i

        elif lfn_denominator == 'N_ik':  # variant_replicate
            this_filter_id = 3
            N_df = self.variant_read_count_lfn_df.get_N_ik_df()  # Compute N_ik_df
            filter_df = self.variant_read_count_df.merge(
                N_df, on=['run_id', 'marker_id', 'variant_id', 'replicate'])
            filter_df['lfn_ratio'] = filter_df.read_count / filter_df.N_ik
            filter_df['filter_id'] = this_filter_id
            filter_df['cutoff'] = cutoff

            filter_cutoff_specific_df = None
            if cutoff_specific_df is not None:
                this_filter_id = 5
                filter_cutoff_specific_df = filter_df.copy()
                filter_cutoff_specific_df.drop('cutoff', axis=1, inplace=True)
                filter_cutoff_specific_df = filter_cutoff_specific_df.merge(
                    cutoff_specific_df,
                    on=['run_id', 'marker_id', 'variant_id', 'replicate'])
                filter_cutoff_specific_df['filter_id'] = this_filter_id

            filter_df = pandas.concat([filter_df, filter_cutoff_specific_df],
                                      axis=0)
            filter_df['lfn_ratio'] = filter_df.read_count / filter_df.N_ik

        elif lfn_denominator == 'N_jk':  # sample_replicate
            this_filter_id = 6
            N_df = self.variant_read_count_lfn_df.get_N_jk_df()  # Compute N_jk_df
            filter_df = self.variant_read_count_df.merge(
                N_df, on=['run_id', 'marker_id', 'sample_id', 'replicate'])
            filter_df['lfn_ratio'] = filter_df.read_count / filter_df.N_jk
            filter_df['filter_id'] = this_filter_id
            filter_df['cutoff'] = cutoff

        else:
            Logger.instance().critical(
                VTAMexception("Internal error. VTAM will exit."))
            sys.exit(1)

        # Initialize filter: Keep everything
        filter_df['filter_delete'] = False

        # Mark for deletion all variants with read_count=0
        filter_df.loc[filter_df.read_count == 0, 'filter_delete'] = True

        # Mark for deletion all filters with 'lfn_ratio'<=lfn_variant_cutoff
        filter_df.loc[filter_df['lfn_ratio'] <= filter_df['cutoff'],
                      'filter_delete'] = True

        # Keep important columns
        filter_df = filter_df[[
            'run_id', 'marker_id', 'sample_id', 'replicate', 'variant_id',
            'read_count', 'filter_id', 'filter_delete'
        ]]

        # Concatenate the output of this filter vertically to
        # self.variant_read_count_filter_delete_df
        self.variant_read_count_filter_delete_df = pandas.concat(
            [self.variant_read_count_filter_delete_df, filter_df],
            sort=False,
            axis=0)
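
A toy numeric check of the N_i branch: with a general cutoff of 0.001, an occurrence with read_count=2 out of N_i=10000 reads gives lfn_ratio = 2/10000 = 0.0002 <= 0.001, so it is marked for deletion. A minimal sketch with made-up numbers:

    import pandas
    # Made-up toy data for one variant i with N_i = 10000 reads overall.
    df = pandas.DataFrame({'read_count': [2, 9998], 'N_i': [10000, 10000]})
    df['lfn_ratio'] = df.read_count / df.N_i
    df['filter_delete'] = df['lfn_ratio'] <= 0.001
    print(df.filter_delete.tolist())  # [True, False]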
Example 27
    def run(self):
        session = self.session
        engine = session._session().get_bind()

        #######################################################################
        #
        # Wrapper inputs, outputs and parameters
        #
        #######################################################################
        #
        # Input files
        fasta_info_tsv = self.input_file(
            FilterMinReplicateNumber.__input_file_sortedinfo)
        #
        # Input tables
        input_filter_lfn_model = self.input_table(
            FilterMinReplicateNumber.__input_table_variant_filter_lfn)
        #
        # Options
        min_replicate_number = self.option("min_replicate_number")
        # input_filter_lfn = self.option("input_filter_lfn")
        #
        # Output tables
        output_filter_min_replicate_model = self.output_table(
            FilterMinReplicateNumber.__output_table_filter_min_replicate_number)

        #######################################################################
        #
        # 1. Read sortedinfo to get run_id, marker_id, sample_id, replicate for current analysis
        # 2. Delete marker_name/run_name/sample/replicate from variant_read_count_model
        # 3. Get nijk_df input
        #
        #######################################################################

        sample_info_tsv_obj = FileSampleInformation(tsv_path=fasta_info_tsv)

        sample_info_tsv_obj.delete_from_db(
            engine=engine, variant_read_count_like_model=output_filter_min_replicate_model)
        filter_id = None
        if input_filter_lfn_model.__tablename__ == "FilterLFN":
            filter_id = 8  # Variant pass all filters LFN
        variant_read_count_df = sample_info_tsv_obj.get_nijk_df(
            variant_read_count_like_model=input_filter_lfn_model, engine=engine, filter_id=filter_id)

        #######################################################################
        #
        # 4. Run Filter
        #
        #######################################################################

        variant_read_count_delete_df = RunnerFilterMinReplicateNumber(
            variant_read_count_df).get_variant_read_count_delete_df(min_replicate_number)

        #######################################################################
        #
        # 5. Write to DB
        # 6. Touch output tables, to update modification date
        # 7. Exit vtam if all variants are deleted
        #
        #######################################################################

        DataframeVariantReadCountLike(variant_read_count_delete_df).to_sql(
            engine=engine, variant_read_count_like_model=output_filter_min_replicate_model)

        for output_table_i in self.specify_output_table():
            declarative_meta_i = self.output_table(output_table_i)
            obj = session.query(declarative_meta_i).order_by(
                declarative_meta_i.id.desc()).first()
            session.query(declarative_meta_i).filter_by(
                id=obj.id).update({'id': obj.id})
            session.commit()

        if (variant_read_count_delete_df.filter_delete.sum()
                == variant_read_count_delete_df.shape[0]):
            Logger.instance().warning(
                VTAMexception(
                    "This filter has deleted all the variants: {}. "
                    "The analysis will stop here.".format(
                        self.__class__.__name__)))
            sys.exit(0)
Example 28
    def main(fastqinfo, fastqdir, fastainfo, fastadir, params=None, num_threads=multiprocessing.cpu_count()):
        ############################################################################################
        #
        # params.yml parameters
        #
        ############################################################################################

        params_dic = FileParams(params).get_params_dic()

        ############################################################################################
        #
        # Read fastq information into stats_df
        #
        ############################################################################################

        fastqinfo_df = FileSampleInformation(fastqinfo).read_tsv_into_df()

        pathlib.Path(
            os.path.dirname(fastainfo)).mkdir(
            parents=True,
            exist_ok=True)
        pathlib.Path(fastadir).mkdir(parents=True, exist_ok=True)

        fastainfo_df = pandas.DataFrame()

        ############################################################################################
        #
        # Loop over fastq pairs to merge
        #
        ############################################################################################

        # File with analysis stats data
        stats_df = pandas.DataFrame({'FastqFwd': [], 'FastqRev': [], 'NbReadsFwd': [], 'NbReadsRev': [], 'FastaMerged': [], 'NbMergedReads': []})

        for fastqfwd, fastqrev in fastqinfo_df[[
                'fastqfwd', 'fastqrev']].drop_duplicates().values:

            fastq_info_df_i = fastqinfo_df.loc[(fastqinfo_df.fastqfwd == fastqfwd) & (
                fastqinfo_df.fastqrev == fastqrev)]

            fastq_fw_abspath = os.path.join(fastqdir, fastqfwd)
            with open(fastq_fw_abspath, 'rb') as fin:
                fastq_fw_linecount = int(sum(1 for _ in fin) / 4)  # FASTQ: 4 lines per read

            fastq_rv_abspath = os.path.join(fastqdir, fastqrev)
            with open(fastq_rv_abspath, 'rb') as fin:
                fastq_rv_linecount = int(sum(1 for _ in fin) / 4)  # FASTQ: 4 lines per read

            Logger.instance().debug(
                "Analysing FASTQ files: {} and {}".format(
                    fastqfwd, fastqrev))

            try:
                pathlib.Path(fastq_fw_abspath).resolve(strict=True)
            except FileNotFoundError:
                Logger.instance().error(
                    VTAMexception(
                        "VTAMexception: This FASTQ file was not found: {}.".format(fastq_fw_abspath)))
                sys.exit(1)
            try:
                pathlib.Path(fastq_rv_abspath).resolve(strict=True)
            except FileNotFoundError:
                Logger.instance().error(
                    VTAMexception(
                        "VTAMexception: This FASTQ file was not found: {}.".format(fastq_rv_abspath)))
                sys.exit(1)

            fasta_merged_basename = os.path.basename(
                fastq_fw_abspath).replace('.fastq', '.fasta')
            out_fasta_path = os.path.join(fastadir, fasta_merged_basename)

            ########################################################################################
            #
            # Run vsearch merge
            #
            ########################################################################################

            vsearch_args_dic = {}

            vsearch_args_dic['fastq_ascii'] = params_dic['fastq_ascii']
            vsearch_args_dic['fastq_maxee'] = params_dic['fastq_maxee']
            vsearch_args_dic['fastq_maxmergelen'] = params_dic['fastq_maxmergelen']
            vsearch_args_dic['fastq_maxns'] = params_dic['fastq_maxns']
            vsearch_args_dic['fastq_minlen'] = params_dic['fastq_minlen']
            vsearch_args_dic['fastq_minmergelen'] = params_dic['fastq_minmergelen']
            vsearch_args_dic['fastq_minovlen'] = params_dic['fastq_minovlen']
            vsearch_args_dic['fastq_truncqual'] = params_dic['fastq_truncqual']

            vsearch_args_dic['fastq_mergepairs'] = fastq_fw_abspath
            vsearch_args_dic['reverse'] = fastq_rv_abspath
            vsearch_args_dic['fastaout'] = out_fasta_path
            vsearch_args_dic['threads'] = num_threads

            vsearch_cluster = RunnerVSearch(parameters=vsearch_args_dic)
            vsearch_cluster.run()

            fastq_info_df_i = fastq_info_df_i[['run', 'marker', 'sample', 'replicate', 'tagfwd',
                                               'primerfwd', 'tagrev', 'primerrev']]
            fastq_info_df_i['mergedfasta'] = fasta_merged_basename
            fastainfo_df = pandas.concat(
                [fastainfo_df, fastq_info_df_i], axis=0)

            with open(out_fasta_path, 'rb') as fin:
                fasta_merged_linecount = int(sum(1 for _ in fin) / 4)

            ########################################################################################
            #
            # Summary file
            #
            ########################################################################################

            stats_df = pandas.concat([stats_df, pandas.DataFrame({
                'FastqFwd': [fastq_fw_abspath], 'FastqRev': [fastq_rv_abspath],
                'NbReadsFwd': [fastq_fw_linecount], 'NbReadsRev': [fastq_rv_linecount],
                'FastaMerged': [out_fasta_path], 'NbMergedReads': [fasta_merged_linecount]})])

        for mergedfasta in fastainfo_df[['mergedfasta']].drop_duplicates().values:
            mergedfasta = mergedfasta[0]

            if mergedfasta.endswith('.bz2') or  mergedfasta.endswith('.gz'):
                fasta_merged_abspath = os.path.join(fastadir, mergedfasta)
                mergedfasta_compressor = FileCompression(fasta_merged_abspath)
            
                if mergedfasta.endswith('.gz'):
                    mergedfasta_c = mergedfasta_compressor.pigz_compression()
                    if mergedfasta_c is None:
                        mergedfasta_c = mergedfasta_compressor.gzip_compression()

                    
                elif mergedfasta.endswith('.bz2'):
                    mergedfasta_c = mergedfasta_compressor.bz2_compression()
                    
                mergedfasta_compressor.delete_file()
                _, mergedfasta_basename = os.path.split(mergedfasta_c)
                fastainfo_df.loc[fastainfo_df['mergedfasta'] == mergedfasta, 'mergedfasta'] = mergedfasta_basename
                
            else:
                # Uncompressed output: the basename already stored in
                # fastainfo_df is correct, so there is nothing to rename here
                pass

        
        fastainfo_df.to_csv(fastainfo, sep="\t", header=True, index=False)
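
# Sketch (assumption, not the actual FileCompression API): a minimal gzip
# fallback using only the standard library. FileCompression.gzip_compression
# is expected to behave roughly like this when pigz is unavailable: compress
# <path> to <path>.gz and return the new path, or None on failure.
import gzip
import shutil

def gzip_compression_sketch(path):
    """Compress path to path.gz; return the new path, or None on failure."""
    gz_path = path + '.gz'
    try:
        with open(path, 'rb') as fin, gzip.open(gz_path, 'wb') as fout:
            shutil.copyfileobj(fin, fout)  # stream in chunks, no full read
    except OSError:
        return None
    return gz_path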
Example no. 29
0
    def run(self):
        session = self.session
        engine = session._session().get_bind()

        #######################################################################
        #
        # Wrapper inputs, outputs and parameters
        #
        #######################################################################

        # Input file
        fasta_info_tsv = self.input_file(FilterChimera.__input_file_sortedinfo)
        #
        # Input table models
        # Variant = self.input_table(FilterChimera.__input_table_Variant)
        input_filter_pcr_error_model = self.input_table(
            FilterChimera.__input_table_filter_pcr_error)
        #
        # Output table models
        output_filter_chimera_model = self.output_table(
            FilterChimera.__output_table_filter_chimera)
        output_filter_borderline_model = self.output_table(
            FilterChimera.__output_table_filter_chimera_borderline)
        #
        # Params
        uchime3_denovo_abskew = self.option("uchime3_denovo_abskew")

        #######################################################################
        #
        # 1. Read sortedinfo to get run_id, marker_id, sample_id, replicate for current analysis
        # 2. Delete any previous rows for these run/marker/sample/replicate combinations from the output models
        # 3. Get nijk_df input
        #
        #######################################################################

        sample_info_tsv_obj = FileSampleInformation(tsv_path=fasta_info_tsv)

        sample_info_tsv_obj.delete_from_db(
            engine=engine,
            variant_read_count_like_model=output_filter_chimera_model)

        sample_info_tsv_obj.delete_from_db(
            engine=engine,
            variant_read_count_like_model=output_filter_borderline_model)

        variant_read_count_df = sample_info_tsv_obj.get_nijk_df(
            variant_read_count_like_model=input_filter_pcr_error_model,
            engine=engine,
            filter_id=None)
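
        # "nijk" denotes the read count n of variant i in sample j and
        # replicate k: the DataFrame is expected to hold one row per
        # (run_id, marker_id, variant_id, sample_id, replicate) with its
        # read count (column names assumed from the models above).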

        #######################################################################
        #
        # 4. Run Filter
        #
        #######################################################################

        variant_df = sample_info_tsv_obj.get_variant_df(
            variant_read_count_like_model=input_filter_pcr_error_model,
            engine=engine)
        filter_chimera_runner = RunnerFilterChimera(
            variant_read_count_df=variant_read_count_df)
        filter_output_chimera_df, filter_borderline_output_df = \
            filter_chimera_runner.get_variant_read_count_delete_df(
                variant_df=variant_df, uchime3_denovo_abskew=uchime3_denovo_abskew)
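
        # Illustration (assumption about RunnerFilterChimera internals):
        # chimera detection is expected to wrap vsearch's de-novo chimera
        # module, i.e. something close to:
        #
        #   vsearch --uchime3_denovo <variants.fasta> --abskew <abskew> \
        #       --nonchimeras <ok.fasta> --chimeras <chimeras.fasta> \
        #       --borderline <borderline.fasta>
        #
        # Chimeric variants are marked for deletion; borderline ones are
        # routed to the separate borderline table.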

        #######################################################################
        #
        # 5. Write to DB
        # 6. Touch output tables, to update modification date
        # 7. Exit vtam if all variants are deleted
        #
        #######################################################################

        DataframeVariantReadCountLike(filter_output_chimera_df).to_sql(
            engine=engine,
            variant_read_count_like_model=output_filter_chimera_model)

        DataframeVariantReadCountLike(filter_borderline_output_df).to_sql(
            engine=engine,
            variant_read_count_like_model=output_filter_borderline_model)

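        # "Touching" the output tables: rewriting the last row's id with its
        # own value issues a no-op UPDATE, which is assumed to refresh the
        # table's modification date in the workflow manager's bookkeeping.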
        for output_table_i in self.specify_output_table():
            declarative_meta_i = self.output_table(output_table_i)
            obj = session.query(declarative_meta_i).order_by(
                declarative_meta_i.id.desc()).first()
            session.query(declarative_meta_i).filter_by(id=obj.id).update(
                {'id': obj.id})
            session.commit()

        if filter_output_chimera_df.filter_delete.sum() == filter_output_chimera_df.shape[0]:
            Logger.instance().warning(
                VTAMexception("This filter has deleted all the variants: {}. "
                              "The analysis will stop here.".format(
                                  self.__class__.__name__)))
            sys.exit(0)
Example no. 30
0
    def __init__(self, sys_argv):

        ############################################################################################
        #
        # Parse arguments
        #
        ############################################################################################

        self.sys_argv = sys_argv
        # Note (AG): abspath is not used for the moment; it could be exposed
        # later as an option
        parser = ArgParser.get_main_arg_parser()
        self.args = parser.parse_args(sys_argv)

        arg_parser_dic = vars(self.args)

        ############################################################################################
        #
        # If not specified, initialize params.yml
        #
        ############################################################################################

        if 'params' in arg_parser_dic and arg_parser_dic['params'] is None:
            params_yml = os.path.join(PathManager.instance().get_configdir(), "params.yml")
            if not os.path.isfile(params_yml):
                pathlib.Path(params_yml).touch(exist_ok=False)
            arg_parser_dic['params'] = params_yml
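
        # Once populated, params.yml is expected to hold flat key/value
        # pairs; illustrative content only (parameter names taken from
        # elsewhere in these examples, values are placeholders):
        #
        #   fastq_maxee: 1
        #   lfn_variant_cutoff: 0.001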

        ############################################################################################
        #
        # Parse log arguments
        #
        ############################################################################################

        if 'log_verbosity' in arg_parser_dic:
            (LoggerArguments.instance()).update({'log_verbosity': arg_parser_dic['log_verbosity']})
            os.environ['VTAM_LOG_VERBOSITY'] = str(
                arg_parser_dic['log_verbosity'])

        if 'log' in arg_parser_dic:
            (LoggerArguments.instance()).update({'log': arg_parser_dic['log']})
            os.environ['VTAM_LOG_FILE'] = str(arg_parser_dic['log'])

        #######################################################################
        #
        # Set arguments, logger
        #
        #######################################################################

        # Some arguments will be passed through environmental variables
        if 'threads' in arg_parser_dic:
            os.environ['VTAM_THREADS'] = str(arg_parser_dic['threads'])

        ############################################################################################
        #
        # Subcommands: wopfile-dependent, filter, optimize
        #
        ############################################################################################

        if arg_parser_dic['command'] in ['filter', 'optimize']:

            if arg_parser_dic['command'] in ['filter']:

                ####################################################################################
                #
                # Verify coherence of --lfn_variant_replicate and params arguments
                #
                ####################################################################################

                with open(arg_parser_dic['params']) as fin:
                    # yaml.SafeLoader converts YAML scalar values to native
                    # Python types in the resulting dictionary
                    params_dic = yaml.load(fin, Loader=yaml.SafeLoader) or {}

                    if arg_parser_dic['lfn_variant_replicate']:
                        if 'lfn_variant_cutoff' in params_dic:
                            Logger.instance().error(VTAMexception(
                                'The parameter "lfn_variant_cutoff" in the parameter file "{}" is incompatible with'
                                ' the --lfn_variant_replicate argument.'.format(arg_parser_dic['params'])))
                            sys.exit(1)

                    else:
                        if 'lfn_variant_replicate_cutoff' in params_dic:
                            Logger.instance().error(VTAMexception(
                                'The parameter "lfn_variant_replicate_cutoff" in the parameter file "{}" needs'
                                ' the --lfn_variant_replicate argument.'.format(arg_parser_dic['params'])))
                            sys.exit(1)
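
                    # Accepted combinations at a glance (illustrative):
                    #   --lfn_variant_replicate given -> 'lfn_variant_replicate_cutoff' allowed in params
                    #   flag absent                   -> 'lfn_variant_cutoff' allowed in params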

                ####################################################################################
                #
                # Verify coherence of --lfn_variant_replicate and cutoff_specific argument
                #
                ####################################################################################

                if arg_parser_dic['cutoff_specific'] is not None:  # cutoff_specific argument given

                    if arg_parser_dic['lfn_variant_replicate']:  # lfn_variant_replicate

                        # cutoff_specific for lfn_variant
                        if not FileCutoffSpecific(arg_parser_dic['cutoff_specific']).is_compatible_lfn_variant_replicate():
                            Logger.instance().error('The --lfn_variant_replicate argument is incompatible with the cutoff_specific file {}.'.format(
                                    arg_parser_dic['cutoff_specific']))
                            sys.exit(1)

                    else:  # lfn_variant

                        # cutoff_specific for lfn_variant_replicate
                        if FileCutoffSpecific(arg_parser_dic['cutoff_specific']).is_compatible_lfn_variant_replicate():
                            Logger.instance().error('The cutoff_specific file {} requires the --lfn_variant_replicate argument.'.format(
                                    arg_parser_dic['cutoff_specific']))
                            sys.exit(1)
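
                # The cutoff_specific TSV is expected to carry the columns
                # used in example no. 1 (run_name, marker_name, variant_id,
                # variant_sequence) plus a cutoff value; the
                # --lfn_variant_replicate flavour is assumed to add a
                # replicate column.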

                ############################################################################################
                #
                # If not specified, initialize the cutoff_specific file
                #
                ############################################################################################

                if arg_parser_dic['cutoff_specific'] is None:
                    cutoff_specific_tsv = os.path.join(PathManager.instance().get_configdir(),
                                                       "cutoff_specific.tsv")
                    if not os.path.isfile(cutoff_specific_tsv):
                        pathlib.Path(cutoff_specific_tsv).touch(exist_ok=False)
                    arg_parser_dic['cutoff_specific'] = cutoff_specific_tsv

            CommandFilterOptimize.main(arg_parser_dic=arg_parser_dic)

        ############################################################################################
        #
        # Subcommand: example
        #
        ############################################################################################

        elif arg_parser_dic['command'] == 'example':
            outdir = arg_parser_dic['outdir']
            CommandExample.main(outdir=outdir)

        ############################################################################################
        #
        # Subcommand: merge
        #
        ############################################################################################

        elif arg_parser_dic['command'] == 'merge':
            fastqinfo = arg_parser_dic['fastqinfo']
            fastqdir = arg_parser_dic['fastqdir']
            fastainfo = arg_parser_dic['fastainfo']
            fastadir = arg_parser_dic['fastadir']
            num_threads = arg_parser_dic['threads']
            params = arg_parser_dic['params']
            CommandMerge.main(fastqinfo=fastqinfo, fastqdir=fastqdir, fastainfo=fastainfo,
                              fastadir=fastadir, params=params, num_threads=num_threads)
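
            # Typical invocation (paths illustrative; flags mirror the
            # arg_parser_dic keys above):
            #   vtam merge --fastqinfo fastqinfo.tsv --fastqdir fastq \
            #       --fastainfo fastainfo.tsv --fastadir merged --threads 4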

        ############################################################################################
        #
        # Subcommand: sortreads
        #
        ############################################################################################

        elif arg_parser_dic['command'] == 'sortreads':
            fastadir = arg_parser_dic['fastadir']
            fastainfo = arg_parser_dic['fastainfo']
            sorteddir = arg_parser_dic['sorteddir']
            num_threads = arg_parser_dic['threads']
            params = arg_parser_dic['params']
            CommandSortReads.main(fastainfo=fastainfo, fastadir=fastadir, params=params,
                                  num_threads=num_threads, sorteddir=sorteddir)

        ############################################################################################
        #
        # Subcommand: taxassign
        #
        ############################################################################################

        elif arg_parser_dic['command'] == 'taxassign':
            db = arg_parser_dic['db']
            asvtable_tsv = arg_parser_dic['asvtable']
            output = arg_parser_dic['output']
            mode = arg_parser_dic['mode']
            taxonomy_tsv = arg_parser_dic['taxonomy']
            blastdb_dir_path = arg_parser_dic['blastdbdir']
            blastdbname_str = arg_parser_dic['blastdbname']
            num_threads = arg_parser_dic['threads']
            params = arg_parser_dic['params']
            CommandTaxAssign.main(db=db, mode=mode, asvtable_tsv=asvtable_tsv, output=output,
                                  taxonomy_tsv=taxonomy_tsv, blastdb_dir_path=blastdb_dir_path,
                                  blastdbname_str=blastdbname_str, params=params, num_threads=num_threads)

        ############################################################################################
        #
        # Subcommand: pool
        #
        ############################################################################################

        elif arg_parser_dic['command'] == 'pool':
            db = arg_parser_dic['db']
            readcounts = arg_parser_dic['readcounts']
            run_marker_tsv = arg_parser_dic['runmarker']
            pooled_marker_tsv = arg_parser_dic['asvtable']
            params = arg_parser_dic['params']
            CommandPoolRunMarkers.main(db=db, pooled_marker_tsv=pooled_marker_tsv,
                run_marker_tsv=run_marker_tsv, params=params, readcounts=readcounts)

        ############################################################################################
        #
        # Subcommand: taxonomy
        #
        ############################################################################################

        elif arg_parser_dic['command'] == 'taxonomy':
            taxonomy_tsv = arg_parser_dic['output']
            precomputed = arg_parser_dic['precomputed']
            taxonomy = CommandTaxonomy(taxonomy_tsv=taxonomy_tsv)
            taxonomy.main(precomputed=precomputed)

        ############################################################################################
        #
        # Subcommand: coi_blast_db
        #
        ############################################################################################

        elif arg_parser_dic['command'] == 'coi_blast_db':
            blastdbdir = arg_parser_dic['blastdbdir']
            blastdbname = arg_parser_dic['blastdbname']
            coi_blast_db = CommandBlastCOI(blastdbname=blastdbname)
            coi_blast_db.download(blastdbdir=blastdbdir)

        ############################################################################################
        #
        # Else: print usage message
        #
        ############################################################################################

        else:
            self.args = parser.parse_args(['--help'])  # if command unknown print help