def parse_busco_full_summary(busco_file, chunk=100000):
    """Parse a BUSCO full summary file."""
    logger.info("Parsing BUSCO full summary file")
    locations = defaultdict(list)
    with tofile.open_file_handle(busco_file) as fh:
        for line in fh:
            if line.startswith("#"):
                continue
            parts = line.split("\t")
            # record start and end of each Complete or Duplicated BUSCO per sequence
            if parts[1] in ("Complete", "Duplicated"):
                locations[parts[2]].append((int(parts[3]), int(parts[4])))
    windows = {}
    for title, tuples in locations.items():
        tuples.sort(key=lambda tup: tup[0])
        windows[title] = []
        start_index = 0
        for location in tuples:
            # open a window of length `chunk` at each BUSCO start
            windows[title].append([location[0], location[0] + chunk, 0])
            for window in windows[title][start_index:]:
                if location[1] < window[1]:
                    # count BUSCOs that end within this window
                    window[2] += 1
                else:
                    # stop updating windows this BUSCO extends beyond
                    start_index += 1
        # most BUSCO-dense windows first
        windows[title].sort(key=lambda window: window[2], reverse=True)
    return windows
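
# Illustrative usage sketch for parse_busco_full_summary. The file name,
# sequence IDs and coordinates below are hypothetical; the column layout
# (busco_id, status, sequence, start, end, ...) follows what the parser
# above reads.
def _example_parse_busco_full_summary(tmp_path="example_full_table.tsv"):
    rows = [
        "# BUSCO version is: 5.x",
        "100at2759\tComplete\tseq1\t1000\t2500\t+\t...",
        "101at2759\tDuplicated\tseq1\t1500\t3200\t+\t...",
        "102at2759\tMissing",
    ]
    with open(tmp_path, "w") as fh:
        fh.write("\n".join(rows) + "\n")
    windows = parse_busco_full_summary(tmp_path, chunk=100000)
    # windows maps each sequence to [window_start, window_end, busco_count]
    # lists, sorted with the most BUSCO-dense window first, e.g.
    # {"seq1": [[1000, 101000, 2], [1500, 101500, 1]]}
    return windows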
def parse_full_tsv(filename):
    """Parse a TSV file containing one value per sequence."""
    values = defaultdict(dict)
    sd = defaultdict(dict)
    n = defaultdict(dict)
    header = None
    with tofile.open_file_handle(filename) as fh:
        for line in fh:
            row = line.strip().split("\t")
            if header is None:
                header = {key: idx + 3 for idx, key in enumerate(row[3:])}
                continue
            length = int(row[2]) - int(row[1])
            values["length"][row[0]] = length
            values["position"][row[0]] = int(row[2])
            for key, idx in header.items():
                if key.endswith("_sd"):
                    sd[key[:key.rfind("_sd")]][row[0]] = float(row[idx])
                elif key.endswith("_n"):
                    n[key[:key.rfind("_n")]][row[0]] = float(row[idx])
                elif key.endswith("_cpm"):
                    values[key][row[0]] = float(
                        "%.3g" % (float(row[idx]) / length * 1000000)
                    )
                elif key.endswith("_count"):
                    values[key][row[0]] = int(row[idx])
                else:
                    values[key][row[0]] = float(row[idx])
    return values, sd, n
def parse_windowed_tsv(filename, window):
    """Parse a TSV file containing multiple windowed values per sequence."""
    values = defaultdict(lambda: defaultdict(list))
    sd = defaultdict(lambda: defaultdict(list))
    n = defaultdict(lambda: defaultdict(list))
    lengths = {}
    header = None
    with tofile.open_file_handle(filename) as fh:
        for line in fh:
            row = line.strip().split("\t")
            if header is None:
                header = {key: idx + 3 for idx, key in enumerate(row[3:])}
                continue
            length = int(row[2]) - int(row[1])
            if float(window) > 1:
                # when the window size is given in bases, skip windows shorter
                # than the longest window already seen for this sequence
                prev_len = lengths.get(row[0], 0)
                if length < prev_len:
                    continue
                lengths[row[0]] = length
            values["length"][row[0]].append(length)
            values["position"][row[0]].append(round(int(row[1]) + length / 2))
            for key, idx in header.items():
                if key.endswith("_sd"):
                    sd[key[:key.rfind("_sd")]][row[0]].append(float(row[idx]))
                elif key.endswith("_n"):
                    n[key[:key.rfind("_n")]][row[0]].append(float(row[idx]))
                elif key.endswith("_cpm"):
                    # convert raw counts to counts per million bases for the window
                    values[key][row[0]].append(
                        float("%.3g" % (float(row[idx]) / length * 1000000))
                    )
                elif key.endswith("_count"):
                    values[key][row[0]].append(int(row[idx]))
                else:
                    values[key][row[0]].append(float(row[idx]))
    return values, sd, n
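
# Illustrative sketch of the structures returned by parse_full_tsv and
# parse_windowed_tsv above. The file name and column names (gc, cov_sd,
# cov_n, reads_cpm) are hypothetical; the first three columns are assumed
# to be sequence, start and end, matching the header offset of 3 used above.
def _example_parse_windowed_tsv(tmp_path="example_windows.tsv"):
    rows = [
        "sequence\tstart\tend\tgc\tcov_sd\tcov_n\treads_cpm",
        "seq1\t0\t100000\t0.41\t2.5\t100\t12",
        "seq1\t100000\t200000\t0.39\t3.1\t100\t15",
    ]
    with open(tmp_path, "w") as fh:
        fh.write("\n".join(rows) + "\n")
    values, sd, n = parse_windowed_tsv(tmp_path, window=100000)
    # values["gc"]["seq1"] == [0.41, 0.39]
    # sd["cov"]["seq1"] == [2.5, 3.1]; n["cov"]["seq1"] == [100.0, 100.0]
    # values["reads_cpm"]["seq1"] == [120.0, 150.0]  (rescaled per million bases)
    return values, sd, n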
def parse_busco_summary(filename, mask, header):
    """Count Complete and Duplicated BUSCOs within each masked window."""
    lineage = None
    with tofile.open_file_handle(filename) as fh:
        buscos = defaultdict(list)
        for line in fh.readlines():
            if line.startswith("#"):
                if line.startswith("# The lineage dataset is:"):
                    meta = line.split()
                    lineage = meta[5]
                    header.append("%s_count" % lineage)
                continue
            busco, status, *rest = line.rstrip().split("\t")
            if status in {"Fragmented", "Missing"}:
                continue
            seqid, start, *rest = rest
            buscos[seqid].append(int(start))
    if lineage is not None:
        for seqid in mask:
            starts = sorted(buscos[seqid])
            i = 0
            for start, obj in mask[seqid].items():
                ctr = 0
                # count BUSCO start positions falling within this window
                while i < len(starts):
                    if starts[i] >= start:
                        if starts[i] > obj["end"]:
                            break
                        ctr += 1
                    i += 1
                obj["cols"].append(ctr)
    return mask, header
def parse_full_bed(filename):
    """Parse a BED file containing one value per sequence."""
    parsed = {}
    with tofile.open_file_handle(filename) as fh:
        for line in fh:
            row = line.strip().split("\t")
            if len(row) < 5:
                return parsed
            parsed[row[0]] = float(row[4])
    return parsed
def parse_window_bed(filename):
    """Parse a BED file containing multiple values per sequence."""
    parsed = defaultdict(list)
    with tofile.open_file_handle(filename) as fh:
        for line in fh:
            row = line.strip().split("\t")
            parsed[row[0]].append((int(row[1]), float(row[4])))
    windowed = {}
    for seq_id, arr in parsed.items():
        windowed[seq_id] = [tup[1] for tup in sorted(arr, key=lambda tup: tup[0])]
    return windowed
def load_mask(filename):
    """Load a BED file as a mask."""
    mask = defaultdict(dict)
    header = []
    with tofile.open_file_handle(filename) as fh:
        for line in fh.readlines():
            seqid, start, end, *cols = line.rstrip().split("\t")
            if cols is None:
                cols = []
            if seqid == "sequence" and start == "start":
                header = cols
                continue
            mask[seqid].update({int(start): {"end": int(end), "cols": cols}})
    return mask, header
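
# Illustrative sketch of the mask structure produced by load_mask above and
# consumed by parse_busco_summary. The file name, sequence IDs and values are
# hypothetical; the header row is recognised by its literal "sequence" /
# "start" fields, as in load_mask.
def _example_load_mask(tmp_path="example_mask.tsv"):
    rows = [
        "sequence\tstart\tend\tgc",
        "seq1\t0\t100000\t0.41",
        "seq1\t100000\t200000\t0.39",
    ]
    with open(tmp_path, "w") as fh:
        fh.write("\n".join(rows) + "\n")
    mask, header = load_mask(tmp_path)
    # header == ["gc"]
    # mask["seq1"] == {0: {"end": 100000, "cols": ["0.41"]},
    #                  100000: {"end": 200000, "cols": ["0.39"]}}
    return mask, header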
def parse_chunked_values(filename):
    """Parse chunked values into dict."""
    interval = 0
    header = None
    values = defaultdict(lambda: defaultdict(list))
    lengths = defaultdict(int)
    with tofile.open_file_handle(filename) as fh:
        for line in fh.readlines():
            row = line.rstrip().split("\t")
            if header is None:
                header = {key: idx + 3 for idx, key in enumerate(row[3:])}
                continue
            seqid = row[0]
            chunk_length = int(row[2]) - int(row[1])
            # track the largest chunk length seen as the chunking interval
            if chunk_length > interval:
                interval = chunk_length
            lengths[seqid] += chunk_length
            for key, idx in header.items():
                values[seqid][key].append(float(row[idx]))
    return lengths, values, interval
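
# Illustrative sketch of the values returned by parse_chunked_values above.
# The file name and the gc column are hypothetical; the first three columns
# are assumed to be sequence, start and end.
def _example_parse_chunked_values(tmp_path="example_chunks.tsv"):
    rows = [
        "sequence\tstart\tend\tgc",
        "seq1\t0\t100000\t0.41",
        "seq1\t100000\t150000\t0.44",
    ]
    with open(tmp_path, "w") as fh:
        fh.write("\n".join(rows) + "\n")
    lengths, values, interval = parse_chunked_values(tmp_path)
    # lengths == {"seq1": 150000}   (sum of chunk lengths per sequence)
    # values["seq1"]["gc"] == [0.41, 0.44]
    # interval == 100000            (largest chunk length seen)
    return lengths, values, interval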
def parse_assembly_report(filename, cat_filename, syn_filename):
    """Parse synonyms and assembly level into TSV files."""
    synonyms = []
    categories = []
    cats = {
        "identifier": {"index": 4, "list": []},
        "assembly_role": {"index": 1, "list": []},
        "assembly_level": {"index": 3, "list": []},
        "assembly_unit": {"index": 7, "list": []},
    }
    names = {
        "identifier": {"index": 4, "list": []},
        "name": {"index": 0, "list": []},
        "assigned_name": {"index": 2, "list": []},
        "refseq_accession": {"index": 6, "list": []},
    }
    with tofile.open_file_handle(filename) as fh:
        for line in fh:
            if line.startswith("#"):
                continue
            row = line.rstrip().split("\t")
            for group in (cats, names):
                for obj in group.values():
                    value = row[obj["index"]]
                    obj["list"].append(value)
    # only keep columns with more than one distinct value
    header = []
    for key, obj in cats.items():
        if len(set(obj["list"])) > 1:
            header.append(key)
    categories.append(header)
    for idx, value in enumerate(cats[header[0]]["list"]):
        row = [value]
        for key in header[1:]:
            row.append(cats[key]["list"][idx])
        categories.append(row)
    tofile.write_file(cat_filename, categories)
    header = []
    for key, obj in names.items():
        if len(set(obj["list"])) > 1:
            header.append(key)
    synonyms.append(header)
    for idx, value in enumerate(names[header[0]]["list"]):
        row = [value]
        for key in header[1:]:
            row.append(names[key]["list"][idx])
        synonyms.append(row)
    tofile.write_file(syn_filename, synonyms)
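
# Illustrative sketch of parse_assembly_report above, run on a minimal
# NCBI-style assembly report. The file names and row values are hypothetical;
# the column order follows the indexes used above (name, role, assigned name,
# level, GenBank accession, ..., RefSeq accession, unit), and output is
# written via tofile.write_file as in the function itself.
def _example_parse_assembly_report():
    rows = [
        "# Assembly name:  example_v1",
        "chr1\tassembled-molecule\t1\tChromosome\tCM000001.1\t=\tNC_000001.1\tPrimary Assembly\t1000000\tchr1",
        "scaffold_1\tunplaced-scaffold\tna\tunplaced\tJAAAAA010000001.1\t=\tNW_000001.1\tPrimary Assembly\t50000\tna",
    ]
    with open("example_assembly_report.tsv", "w") as fh:
        fh.write("\n".join(rows) + "\n")
    parse_assembly_report(
        "example_assembly_report.tsv",
        "example.categories.tsv",
        "example.synonyms.tsv",
    )
    # example.categories.tsv holds the identifier plus role/level/unit columns
    # that vary across rows; example.synonyms.tsv maps each identifier to its
    # other names (sequence name, assigned name, RefSeq accession).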