Example #1
def read_peroba_database(f_prefix):
    if f_prefix[-1] == ".":
        f_prefix = f_prefix[:-1]  # both `perobaDB.0621` and `perobaDB.0621.` are valid
    fname = f_prefix + common.suffix["metadata"]
    logger.info(f"Reading database metadata from \'{fname}\'")
    metadata = pd.read_csv(fname,
                           compression="infer",
                           index_col="peroba_seq_uid",
                           dtype="unicode")
    metadata = common.df_finalise_metadata(metadata)

    fname = f_prefix + common.suffix["tree"]
    logger.info(f"Reading database tree from \'{fname}\'")
    treestring = open(fname).readline().rstrip().replace("\'", "").replace(
        "\"", "").replace("[&R]", "")
    tree = treeswift.read_tree_newick(treestring)

    fname = f_prefix + common.suffix["alignment"]
    logger.info(f"Reading database alignment from \'{fname}\'")
    sequences = common.read_fasta(fname, check_name=False)

    logger.info("Finished loading the database; the dataframe has dimensions %s; the same number of "
                "sequences is assumed, but the tree may be smaller", metadata.shape)
    return [metadata, sequences, tree]
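A minimal usage sketch for this example, assuming the peroba `common` module, a configured `logger`, and a database built by `peroba_database` are importable; the prefix "perobaDB.0621" is purely illustrative:

metadata, sequences, tree = read_peroba_database("perobaDB.0621")
print(metadata.shape)   # pandas DataFrame indexed by peroba_seq_uid
print(len(sequences))   # records returned by common.read_fasta()
print(sum(1 for _ in tree.traverse_leaves()))  # treeswift leaf count; may be smaller than the table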
Example #2
    def update_alignment(self, alignment, seqs_per_block):
        ref_seq = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                               "data/MN908947.3.fas")
        if alignment is None or len(alignment) < 1:
            logger.info("Aligning all sequences with mafft (no alignment file found)")
            aln = common.align_sequences_in_blocks(list(self.sequences.values()),
                                                   reference_file=ref_seq,
                                                   seqs_per_block=seqs_per_block)
            return aln

        aln = dict()  # aln is dict but prealign is list
        for f in alignment:
            logger.info(f"Reading Alignment file {f}")
            seqs = common.read_fasta(f, check_name=True)  # list
            aln.update({x.id: x for x in seqs})  #  duplicates are overwritten

        prealign = [x for x in aln.values() if x.id in self.sequences]
        prealn_names = {x.id for x in prealign}  # a set, for fast membership tests below
        remain = [x for x in self.sequences.values() if x.id not in prealn_names]
        logger.info("From %s sequences, %s were found in the alignment (which originally had %s sequences)",
                    len(self.sequences), len(prealign), len(aln))
        logger.info("Aligning remaining sequences with mafft")
        aln_list = common.align_sequences_in_blocks(
            remain, reference_file=ref_seq, seqs_per_block=seqs_per_block)
        return prealign + aln_list
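The reuse logic above is the interesting part: records already present in the alignment files are kept verbatim, and only the remainder is sent to mafft. A self-contained sketch of that split, using plain dicts instead of the class state (all names here are hypothetical):

def split_by_existing(prior, current):
    # prior: {id: aligned record}, current: {id: unaligned record}
    reuse = [rec for sid, rec in prior.items() if sid in current]
    reused_ids = {sid for sid in prior if sid in current}
    remain = [rec for sid, rec in current.items() if sid not in reused_ids]
    return reuse, remain

reuse, remain = split_by_existing({"a": "A-", "z": "Z-"}, {"a": "AX", "b": "BX"})
assert reuse == ["A-"] and remain == ["BX"]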
Example #3
def read_peroba_database(f_prefix, trust_global_sequences=False):
    if f_prefix[-1] == ".":
        f_prefix = f_prefix[:-1]  # both `perobaDB.0621` and `perobaDB.0621.` are valid
    fname = f_prefix + common.suffix["metadata"]
    logger.info(f"Reading database metadata from \'{fname}\'")
    metadata = pd.read_csv(fname,
                           compression="infer",
                           index_col="peroba_seq_uid",
                           dtype="unicode")
    metadata = common.df_finalise_metadata(metadata)

    fname = f_prefix + common.suffix["subsample"]
    logger.info(f"Reading subsampling information from \'{fname}\'")
    subsample = pd.read_csv(fname,
                            compression="infer",
                            index_col="peroba_seq_uid",
                            dtype="unicode")
    for col in subsample.columns:
        subsample[col] = pd.to_numeric(subsample[col], errors='coerce')

    fname = f_prefix + common.suffix["tree"]
    logger.info(f"Reading database tree from \'{fname}\'")
    treestring = open(fname).readline().rstrip().replace("\'", "").replace(
        "\"", "").replace("[&R]", "")
    tree = treeswift.read_tree_newick(treestring)

    fname = f_prefix + common.suffix["alignment"]
    logger.info(f"Reading database alignment from \'{fname}\'")
    sequences = common.read_fasta(fname, check_name=False)

    unaligned = []
    if trust_global_sequences:
        logger.info("Will assume global sequences are 'better' than local ones when duplicates exist")
    else:
        fname = f_prefix + common.suffix["sequences"]
        logger.info(f"Reading database unaligned sequences from \'{fname}\'")
        unaligned = common.read_fasta(fname, check_name=False)

    logger.info("Finished loading the database; the dataframe has dimensions %s; the same number of "
                "sequences is assumed, but the tree may be smaller", metadata.shape)
    return [metadata, sequences, tree, subsample, unaligned]
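Compared with Example #1, this variant also returns the subsampling table (with columns coerced to numeric) and, unless `trust_global_sequences` is set, the unaligned local sequences. A hedged unpacking sketch, again with an illustrative prefix:

metadata, sequences, tree, subsample, unaligned = read_peroba_database("perobaDB.0621",
                                                                       trust_global_sequences=False)
print(subsample.describe())  # works because every column was run through pd.to_numeric()
print(len(unaligned))        # empty list when trust_global_sequences=True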
Example #4
    def add_sequences(self, sequence):
        logger.info("Reading fasta sequence files (if not set individually below)")
        self.sequences = dict()
        for f in sequence:
            logger.info(f"Reading sequence file {f}")
            seqs = common.read_fasta(f, check_name=True)  # list
            self.sequences.update({x.id: x for x in seqs})  # dict of SeqRecord(), so duplicates are simply overwritten

        logger.info("Database now has %s valid sequences",
                    str(len(self.sequences)))
        self.merge_data_sequence()
        self.merge_sequence_tree()
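The `update({x.id: x for x in seqs})` call is a compact de-duplication idiom: when the same ID appears in several input files, the record read last wins. A self-contained illustration with stand-in records (the `Rec` class is invented for the example):

from dataclasses import dataclass

@dataclass
class Rec:
    id: str
    seq: str

store = {}
for batch in ([Rec("s1", "AC"), Rec("s2", "GG")], [Rec("s1", "AT")]):
    store.update({r.id: r for r in batch})  # later batches overwrite earlier ids
assert store["s1"].seq == "AT" and len(store) == 2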
Example #5
def main():
    parser = ParserWithErrorHelp(
        description="""
    peroba_backbone is the script that generates a global backbone data set (COGUK+GISAID) given a local one (NORW).
    It depends on the prefix for a perobaDB set of files (from `peroba_database`), like "perobaDB.0519".
    It's recommended that you also include local sequences, even without CSV metadata. You can furthermore add a newick file with extra 
    trees (the tree from previous run, for instance).
    """,
        usage='''peroba_backbone <perobaDB> [options]''')

    parser.add_argument('perobaDB')
    parser.add_argument('-d',
                        '--debug',
                        action="store_const",
                        dest="loglevel",
                        const=logging.DEBUG,
                        default=logging.WARNING,
                        help="Print debugging statements")
    parser.add_argument('-v',
                        '--verbose',
                        action="store_const",
                        dest="loglevel",
                        const=logging.INFO,
                        help="Add verbosity")
    parser.add_argument(
        '-i',
        '--input',
        action="store",
        help="Directory where perobaDB files are. Default: working directory")
    parser.add_argument('-c',
                        '--csv',
                        metavar='csv',
                        help="csv table with metadata from NORW")
    parser.add_argument('-s',
                        '--sequences',
                        metavar='fasta',
                        nargs='+',
                        help="extra files with local sequences (from NORW)")
    parser.add_argument('-t',
                        '--trees',
                        metavar='',
                        help="file with (user-defined) trees in newick format to help produce the backbone")
    parser.add_argument(
        '-o',
        '--output',
        action="store",
        help="Output database directory. Default: working directory")
    parser.add_argument('-g',
                        '--global_level',
                        metavar='[0,1,2]',
                        type=int,
                        default=0,
                        help="how broad the search should be (default=0, which means new local (COGUK) samples only)")
    parser.add_argument('-f',
                        '--fast',
                        default=False,
                        action='store_true',
                        help="fast mode (known NORW samples are added to the backbone and not to the query)")
    parser.add_argument(
        '-r',
        '--trust',
        default=False,
        action='store_true',
        help="Trust global sequences, skipping quality comparison for matches")

    args = parser.parse_args()
    logging.basicConfig(level=args.loglevel)
    if args.output:
        output_d = os.path.join(current_working_dir, args.output)
        common.pathlib.Path(output_d).mkdir(
            parents=True,
            exist_ok=True)  # python 3.5+ create dir if it doesn't exist
    else:
        output_d = current_working_dir
    prefix = os.path.join(output_d,
                          "peroba_backbone." + datetime.datetime.now().strftime("%m%d_%H%M") + ".")

    if args.input:
        input_d = os.path.join(current_working_dir, args.input)
    else:
        input_d = current_working_dir

    logger.info("Reading metadata, sequences, and tree from peroba_database")
    database = read_peroba_database(
        os.path.join(input_d, args.perobaDB),
        args.trust)  # something like "my_folder/perobaDB.0515"

    csv = None
    if args.csv:
        fname = os.path.join(current_working_dir, args.csv)
        if not os.path.exists(fname):
            fname = os.path.join(input_d, args.csv)
        if not os.path.exists(fname):
            logger.warning(f"Could not find local CSV file {args.csv}; will proceed without it")
        else:
            logger.info("Reading CSV file with metadata from NORW")
            csv = common.df_read_genome_metadata(
                fname, index_name="central_sample_id")
            csv = common.df_finalise_metadata(csv)

    sequences = None
    if args.sequences:
        logger.info("Reading fasta files with sequences from NORW")
        if isinstance(args.sequences, str):
            seqfiles = [args.sequences]
        else:
            seqfiles = args.sequences
        sequences = []
        for f in seqfiles:
            fname = os.path.join(input_d, f)
            if not os.path.exists(fname):
                fname = os.path.join(current_working_dir, f)
            if not os.path.exists(fname):
                logger.warning(f"Could not find sequence file {f}; will proceed without it")
            else:
                s = common.read_fasta(fname, check_name=False)
                sequences += s

    trees = None
    if args.trees:
        fname = os.path.join(current_working_dir, args.trees)
        if not os.path.exists(fname):
            fname = os.path.join(input_d, args.trees)
        if not os.path.exists(fname):
            logger.warning(f"Could not find tree file {args.trees}; will proceed without it")
        else:
            logger.info("Reading file with current trees and checking for duplicate names")
            with open(fname) as treefile:
                treestring = [x.rstrip().replace("'", "").replace('"', "").replace("[&R]", "")
                              for x in treefile]
            trees = []
            for i, trs in enumerate(treestring):  # ete3 is used to remove duplicate leaves
                tre = ete3.Tree(trs)
                tree_length = len([leaf.name for leaf in tre.iter_leaves()])
                tree_leaves = {str(leaf.name): leaf for leaf in tre.iter_leaves()}  # duplicate leaves simply overwrite node information
                if tree_length > len(tree_leaves):
                    tre.prune(list(tree_leaves.values()), preserve_branch_length=True)  # pruning by leaf names would fail on duplicates
                    logger.warning(f"Found duplicated leaf names in input treefile {i}; will keep one at random")
                logger.info("%s leaves in treefile %s", len(tree_leaves), i)
                trees.append(tre)

    main_generate_backbone_dataset(database, csv, sequences, trees, prefix,
                                   args.global_level, args.fast)
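Example #5 resolves each user-supplied file by trying one directory and then falling back to another, in three near-identical blocks. A hedged refactoring sketch of that lookup (the helper name is invented; the behaviour mirrors the code above):

import os

def resolve_input(fname, search_dirs):
    """Return the first existing candidate path, or None if no directory has the file."""
    for d in search_dirs:
        candidate = os.path.join(d, fname)
        if os.path.exists(candidate):
            return candidate
    return None

# e.g. csv_path = resolve_input(args.csv, [current_working_dir, input_d])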