def read_peroba_database(f_prefix):
    if f_prefix[-1] == ".":
        f_prefix = f_prefix[:-1]  # both `perobaDB.0621` and `perobaDB.0621.` are valid prefixes
    fname = f_prefix + common.suffix["metadata"]
    logger.info(f"Reading database metadata from '{fname}'")
    metadata = pd.read_csv(fname, compression="infer", index_col="peroba_seq_uid", dtype="unicode")
    metadata = common.df_finalise_metadata(metadata)

    fname = f_prefix + common.suffix["tree"]
    logger.info(f"Reading database tree from '{fname}'")
    treestring = open(fname).readline().rstrip().replace("'", "").replace('"', "").replace("[&R]", "")
    tree = treeswift.read_tree_newick(treestring)

    fname = f_prefix + common.suffix["alignment"]
    logger.info(f"Reading database alignment from '{fname}'")
    sequences = common.read_fasta(fname, check_name=False)

    logger.info("Finished loading the database; dataframe has dimensions %s and it's assumed "
                "we have the same number of sequences; the tree may be smaller", metadata.shape)
    return [metadata, sequences, tree]
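# Minimal usage sketch for read_peroba_database() (left as a comment since it needs the
# perobaDB file set on disk; the "perobaDB.0621" prefix is hypothetical):
#
#   metadata, sequences, tree = read_peroba_database("perobaDB.0621")
#   logger.info("%s metadata rows and %s aligned sequences", metadata.shape[0], len(sequences))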
def update_alignment(self, alignment, seqs_per_block):
    ref_seq = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data/MN908947.3.fas")
    if alignment is None or len(alignment) < 1:
        logger.info("Aligning all sequences with mafft (no alignment file found)")
        aln = common.align_sequences_in_blocks([x for x in self.sequences.values()],
                                               reference_file=ref_seq, seqs_per_block=seqs_per_block)
        return aln
    aln = dict()  # aln is a dict, while prealign below is a list
    for f in alignment:
        logger.info(f"Reading alignment file {f}")
        seqs = common.read_fasta(f, check_name=True)  # list of SeqRecord()
        aln.update({x.id: x for x in seqs})  # duplicates are simply overwritten
    prealign = [x for x in aln.values() if x.id in self.sequences.keys()]
    prealn_names = set(x.id for x in prealign)  # a set makes the membership test below O(1)
    remain = [x for x in self.sequences.values() if x.id not in prealn_names]
    logger.info("From %s sequences, %s were found in alignment (originally with %s sequences)",
                len(self.sequences), len(prealign), len(aln))
    logger.info("Aligning remaining sequences with mafft")
    aln_list = common.align_sequences_in_blocks(remain, reference_file=ref_seq,
                                                seqs_per_block=seqs_per_block)
    return prealign + aln_list
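# Illustrative sketch (never called): the same keep/re-align split as in update_alignment(),
# shown on plain dicts mapping id -> sequence string, so the logic is easy to test in
# isolation. Names and inputs are hypothetical, not part of the pipeline.
def _split_prealigned_sketch(prealigned, database):
    """Both args map sequence id -> sequence string (simplified stand-ins for SeqRecord)."""
    reuse = [seq for sid, seq in prealigned.items() if sid in database]       # kept as-is
    remain = [seq for sid, seq in database.items() if sid not in prealigned]  # would go to mafft
    return reuse, remain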
def read_peroba_database(f_prefix, trust_global_sequences=False):
    if f_prefix[-1] == ".":
        f_prefix = f_prefix[:-1]  # both `perobaDB.0621` and `perobaDB.0621.` are valid prefixes
    fname = f_prefix + common.suffix["metadata"]
    logger.info(f"Reading database metadata from '{fname}'")
    metadata = pd.read_csv(fname, compression="infer", index_col="peroba_seq_uid", dtype="unicode")
    metadata = common.df_finalise_metadata(metadata)

    fname = f_prefix + common.suffix["subsample"]
    logger.info(f"Reading subsampling information from '{fname}'")
    subsample = pd.read_csv(fname, compression="infer", index_col="peroba_seq_uid", dtype="unicode")
    for col in subsample.columns:
        subsample[col] = pd.to_numeric(subsample[col], errors="coerce")

    fname = f_prefix + common.suffix["tree"]
    logger.info(f"Reading database tree from '{fname}'")
    treestring = open(fname).readline().rstrip().replace("'", "").replace('"', "").replace("[&R]", "")
    tree = treeswift.read_tree_newick(treestring)

    fname = f_prefix + common.suffix["alignment"]
    logger.info(f"Reading database alignment from '{fname}'")
    sequences = common.read_fasta(fname, check_name=False)

    unaligned = []
    if trust_global_sequences:
        logger.info("Will assume global sequences are 'better' than local ones when duplicates exist")
    else:
        fname = f_prefix + common.suffix["sequences"]
        logger.info(f"Reading database unaligned sequences from '{fname}'")
        unaligned = common.read_fasta(fname, check_name=False)

    logger.info("Finished loading the database; dataframe has dimensions %s and it's assumed "
                "we have the same number of sequences; the tree may be smaller", metadata.shape)
    return [metadata, sequences, tree, subsample, unaligned]
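# Usage sketch for the extended reader (left as a comment since it needs the perobaDB files
# on disk; the prefix is hypothetical). With trust_global_sequences=True the unaligned list
# comes back empty, since global sequences win any duplicate comparison:
#
#   metadata, sequences, tree, subsample, unaligned = read_peroba_database(
#       "perobaDB.0621", trust_global_sequences=True)
#   assert len(unaligned) == 0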
def add_sequences(self, sequence):
    logger.info("Reading fasta sequence files (if not set individually below)")
    self.sequences = dict()
    for f in sequence:
        logger.info(f"Reading sequence file {f}")
        seqs = common.read_fasta(f, check_name=True)  # list of SeqRecord()
        self.sequences.update({x.id: x for x in seqs})  # dict of SeqRecord(), so duplicates are simply overwritten
    logger.info("Database now has %s valid sequences", len(self.sequences))
    self.merge_data_sequence()
    self.merge_sequence_tree()
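# Illustrative sketch (never called): add_sequences() relies on dict.update(), so when the
# same id appears in several fasta files, the last file read wins. The ids and sequences
# below are hypothetical.
def _last_file_wins_sketch():
    seqs = dict()
    for batch in ({"s1": "ACGT"}, {"s1": "ACGA", "s2": "AGGT"}):  # two hypothetical files
        seqs.update(batch)
    return seqs  # {'s1': 'ACGA', 's2': 'AGGT'}: the later "s1" replaced the earlier one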
def main():
    parser = ParserWithErrorHelp(
        description="""
        peroba_backbone is the script that generates a global backbone data set (COGUK+GISAID) given a local one (NORW).
        It depends on the prefix for a perobaDB set of files (from `peroba_database`), like "perobaDB.0519".
        It's recommended that you also include local sequences, even without CSV metadata.
        You can furthermore add a newick file with extra trees (the tree from a previous run, for instance).
        """,
        usage="peroba_backbone <perobaDB> [options]")
    parser.add_argument('perobaDB')
    parser.add_argument('-d', '--debug', action="store_const", dest="loglevel",
                        const=logging.DEBUG, default=logging.WARNING,
                        help="Print debugging statements")
    parser.add_argument('-v', '--verbose', action="store_const", dest="loglevel",
                        const=logging.INFO, help="Add verbosity")
    parser.add_argument('-i', '--input', action="store",
                        help="Directory where perobaDB files are. Default: working directory")
    parser.add_argument('-c', '--csv', metavar='csv',
                        help="CSV table with metadata from NORW")
    parser.add_argument('-s', '--sequences', metavar='fasta', nargs='+',
                        help="extra files with local sequences (from NORW)")
    parser.add_argument('-t', '--trees', metavar='',
                        help="file with (user-defined) trees in newick format to help produce the backbone")
    parser.add_argument('-o', '--output', action="store",
                        help="Output database directory. Default: working directory")
    parser.add_argument('-g', '--global_level', metavar='[0,1,2]', type=int, default=0,
                        help="how broad the search should be (default=0, which means local (COGUK) new samples only)")
    parser.add_argument('-f', '--fast', default=False, action='store_true',
                        help="Fast mode (known NORW samples are added to the backbone and not to the query)")
    parser.add_argument('-r', '--trust', default=False, action='store_true',
                        help="Trust global sequences, skipping quality comparison for matches")
    args = parser.parse_args()
    logging.basicConfig(level=args.loglevel)

    if args.output:
        output_d = os.path.join(current_working_dir, args.output)
        common.pathlib.Path(output_d).mkdir(parents=True, exist_ok=True)  # create dir if it doesn't exist (python 3.5+)
    else:
        output_d = current_working_dir
    prefix = os.path.join(output_d, "peroba_backbone." + datetime.datetime.now().strftime("%m%d_%H%M") + ".")

    if args.input:
        input_d = os.path.join(current_working_dir, args.input)
    else:
        input_d = current_working_dir

    logger.info("Reading metadata, sequences, and tree from peroba_database")
    database = read_peroba_database(os.path.join(input_d, args.perobaDB),
                                    args.trust)  # something like "my_folder/perobaDB.0515"

    csv = None
    if args.csv:
        fname = os.path.join(current_working_dir, args.csv)
        if not os.path.exists(fname):
            fname = os.path.join(input_d, args.csv)
        if not os.path.exists(fname):
            logger.warning(f"Could not find local CSV file {args.csv}; will proceed without it")
        else:
            logger.info("Reading CSV file with metadata from NORW")
            csv = common.df_read_genome_metadata(fname, index_name="central_sample_id")
            csv = common.df_finalise_metadata(csv)

    sequences = None
    if args.sequences:
        logger.info("Reading fasta files with sequences from NORW")
        if isinstance(args.sequences, str):
            seqfiles = [args.sequences]
        else:
            seqfiles = args.sequences
        sequences = []
        for f in seqfiles:
            fname = os.path.join(input_d, f)
            if not os.path.exists(fname):
                fname = os.path.join(current_working_dir, f)
            if not os.path.exists(fname):
                logger.warning(f"Could not find sequence file {f}; will proceed without it")
            else:
                s = common.read_fasta(fname, check_name=False)
                sequences += s

    trees = None
    if args.trees:
        fname = os.path.join(current_working_dir, args.trees)
        if not os.path.exists(fname):
            fname = os.path.join(input_d, args.trees)
        if not os.path.exists(fname):
            logger.warning(f"Could not find tree file {args.trees}; will proceed without it")
        else:
            logger.info("Reading file with current trees and checking for duplicate names")
            treestring = [x.rstrip().replace("'", "").replace('"', "").replace("[&R]", "")
                          for x in open(fname)]
            trees = []
            for i, trs in enumerate(treestring):  # ete3 is used here so duplicate leaves can be removed
                tre = ete3.Tree(trs)
                tree_length = len([leaf.name for leaf in tre.iter_leaves()])
                tree_leaves = {str(leaf.name): leaf for leaf in tre.iter_leaves()}  # duplicate leaves simply overwrite the node information
                if tree_length > len(tree_leaves):
                    # prune by node objects, since pruning by leaf names fails on duplicates
                    tre.prune([node for node in tree_leaves.values()], preserve_branch_length=True)
                    logger.warning(f"Found duplicated leaf names in input treefile {i}, will keep one at random")
                logger.info("%s leaves in treefile %s", len(tree_leaves), str(i))
                trees.append(tre)

    main_generate_backbone_dataset(database, csv, sequences, trees, prefix, args.global_level, args.fast)
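# Example invocations, derived from the argparse definitions above (file names, directory
# names, and the database prefix are hypothetical):
#
#   peroba_backbone perobaDB.0519 -v
#   peroba_backbone perobaDB.0519 -i db_dir -c norw_metadata.csv -s norw1.fas norw2.fas -g 1 -f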