def open_input_files(in_filename):
    """Open every input file named in a list file and skip each header.

    ``in_filename`` is a text file holding one input path per line. Each
    listed file is opened for reading in text mode (gzip-compressed files,
    as reported by ``util.is_gzipped``, are opened with ``gzip.open``) and
    its first line is consumed as a header.

    Returns a list of open file objects, each positioned just after its
    first line. The caller is responsible for closing them.

    Writes a message to stderr and exits with status 2 if the list file
    or any listed file does not exist or is not a regular file, or if the
    list file names no files at all.
    """
    # os.path.isfile() is False for missing paths, so one check covers both
    if not os.path.isfile(in_filename):
        sys.stderr.write("input file %s does not exist or is not a "
                         "regular file\n" % in_filename)
        sys.exit(2)

    infiles = []

    # read file that contains list of input files
    with open(in_filename) as list_file:
        for line in list_file:
            filename = line.rstrip()

            if not filename or not os.path.isfile(filename):
                # release handles opened so far before bailing out
                for opened in infiles:
                    opened.close()
                # report the stripped name, not the raw line with its newline
                sys.stderr.write("input file '%s' does not exist or is not "
                                 "a regular file\n" % filename)
                sys.exit(2)

            if util.is_gzipped(filename):
                f = gzip.open(filename, "rt")
            else:
                f = open(filename)

            # skip header
            f.readline()
            infiles.append(f)

    if not infiles:
        # the original referenced an undefined global (options.infile_list)
        sys.stderr.write("no input files specified in file '%s'\n"
                         % in_filename)
        sys.exit(2)

    return infiles
def open_input_files(in_filename):
    """Open every input file named in a list file and skip each header.

    ``in_filename`` is a text file holding one input path per line. The
    name of each listed file is echoed to stderr, the file is opened for
    reading in text mode (gzip-compressed files, as reported by
    ``util.is_gzipped``, are opened with ``gzip.open``) and its first line
    is consumed as a header.

    Returns a list of open file objects, each positioned just after its
    first line. The caller is responsible for closing them.

    Raises IOError if the list file does not exist or is not a regular
    file. Writes a message to stderr and exits with status 2 if a listed
    file does not exist or is not a regular file, or if the list file
    names no files at all.
    """
    # os.path.isfile() is False for missing paths, so one check covers both
    if not os.path.isfile(in_filename):
        raise IOError("input file %s does not exist or is not a "
                      "regular file\n" % in_filename)

    infiles = []

    # read file that contains list of input files
    with open(in_filename, "rt") as list_file:
        for line in list_file:
            filename = line.rstrip()
            sys.stderr.write(" " + filename + "\n")

            if not filename or not os.path.isfile(filename):
                # release handles opened so far before bailing out
                for opened in infiles:
                    opened.close()
                # name the offending file; the original formatted the
                # list-file object here instead of the filename
                sys.stderr.write("input file '%s' does not exist or is not a "
                                 "regular file\n" % filename)
                sys.exit(2)

            if util.is_gzipped(filename):
                f = gzip.open(filename, "rt")
            else:
                f = open(filename, "rt")

            # skip header
            f.readline()
            infiles.append(f)

    if not infiles:
        sys.stderr.write("no input files specified in file '%s'\n"
                         % in_filename)
        sys.exit(2)

    return infiles
def read_file(self, filename):
    """Read SNPs and indels from a text input file.

    Each line must contain at least three whitespace-separated values:
    a position (integer, >= 1) followed by two alleles. Alleles are
    upper-cased and any "-" characters are removed before storing.

    On success the following attributes are set:
      snp_pos     -- int32 array of positions, in file order
      snp_allele1 -- "|S10" array of first alleles
      snp_allele2 -- "|S10" array of second alleles
      snp_index   -- int32 array of length max(position); element
                     pos-1 holds the index of the SNP at that position
                     in the arrays above, all other elements are
                     SNP_UNDEF
      n_snp       -- number of SNPs read

    If the file cannot be opened (IOError) a warning is written to
    stderr, self.clear() is called and the method returns.

    Raises ValueError if a line has fewer than three values or a
    position that is not >= 1.
    """
    try:
        if util.is_gzipped(filename):
            f = gzip.open(filename, "rt")
        else:
            f = open(filename, "rt")
    except IOError:
        sys.stderr.write("WARNING: unable to read from file '%s', "
                         "assuming no SNPs for this chromosome\n" %
                         filename)
        self.clear()
        return

    snp_pos_list = []
    snp_allele1_list = []
    snp_allele2_list = []
    max_pos = 0

    # the with-block guarantees the file is closed even when a malformed
    # line raises ValueError part-way through
    with f:
        for line in f:
            words = line.split()

            if len(words) < 3:
                raise ValueError("expected at least 3 values per SNP "
                                 "file line but got %d:\n"
                                 "%s\n" % (len(words), line))

            pos = int(words[0])
            a1 = words[1].upper().replace("-", "")
            a2 = words[2].upper().replace("-", "")

            if pos <= 0:
                raise ValueError("expected SNP position to be >= 1:\n%s\n" %
                                 line)

            if pos > max_pos:
                max_pos = pos

            snp_pos_list.append(pos)
            snp_allele1_list.append(a1)
            snp_allele2_list.append(a2)

    # convert lists to numpy arrays, which allow for faster
    # lookups and use less memory
    self.snp_pos = np.array(snp_pos_list, dtype=np.int32)
    del snp_pos_list
    self.snp_allele1 = np.array(snp_allele1_list, dtype="|S10")
    del snp_allele1_list
    self.snp_allele2 = np.array(snp_allele2_list, dtype="|S10")
    del snp_allele2_list

    # make another array that makes it easy to lookup SNPs by their
    # position on the chromosome (positions are 1-based, hence pos-1)
    self.snp_index = np.empty(max_pos, dtype=np.int32)
    self.snp_index[:] = SNP_UNDEF
    self.snp_index[self.snp_pos - 1] = np.arange(self.snp_pos.shape[0])

    self.n_snp = self.snp_pos.shape[0]
def get_all_chromosomes(filename):
    """Read chromosome names and lengths from a file and classify them.

    Each line of ``filename`` (plain or gzip-compressed text, as reported
    by ``util.is_gzipped``) must contain at least two whitespace-separated
    columns: the chromosome name and its integer length.

    A Chromosome is created per line and flagged from its lower-cased
    name: ``chr<digits>`` -> is_auto, ``chr`` followed by w-z -> is_sex,
    ``chrm`` -> is_mito, ``chrun``/``chrur`` -> is_rand. Any other name
    triggers a warning on stderr and is flagged is_rand. Independently of
    that, names containing "rand" are flagged is_rand and names containing
    "hap" are flagged is_hap.

    Returns the list of Chromosome objects sorted with ``Chromosome.key``
    and numbered 1..N (``idnum``) in that order.

    Raises ValueError if a line has fewer than two columns.
    """
    if util.is_gzipped(filename):
        f = gzip.open(filename, "rt")
    else:
        f = open(filename, "rt")

    chrom_list = []

    # the with-block closes the file even if a malformed line raises
    with f:
        for line in f:
            words = line.rstrip().split()

            if len(words) < 2:
                raise ValueError("expected at least two columns per line\n")

            chrom = Chromosome(name=words[0], length=int(words[1]))
            chrom_list.append(chrom)

            lc_name = chrom.name.lower()

            # determine whether this is autosome, sex or mitochondrial chrom
            # (raw string: "\d" in a normal literal is an invalid escape)
            if re.match(r'^chr(\d+)', lc_name):
                chrom.is_auto = True
            elif re.match("^chr[W-Zw-z]", lc_name):
                chrom.is_sex = True
            elif lc_name.startswith("chrm"):
                chrom.is_mito = True
            elif lc_name.startswith(("chrun", "chrur")):
                chrom.is_rand = True
            else:
                sys.stderr.write("WARNING: could not determine chromosome type "
                                 "(autosome, sex, mitochondrial) from name "
                                 "'%s'. Assuming 'random'\n" % chrom.name)
                chrom.is_rand = True

            if "rand" in chrom.name:
                # random chromosome
                chrom.is_rand = True

            if "hap" in chrom.name:
                # alt haplotype chromosome
                chrom.is_hap = True

    chrom_list.sort(key=Chromosome.key)

    # number chromosomes 1..N in sorted order
    for idnum, chrom in enumerate(chrom_list, start=1):
        chrom.idnum = idnum

    return chrom_list
def get_all_chromosomes(filename):
    """Build the sorted, numbered list of chromosomes described by a file.

    ``filename`` is a plain or gzip-compressed text file (compression is
    decided by ``util.is_gzipped``) with at least two whitespace-separated
    columns per line: chromosome name and integer length.

    One Chromosome is created per line. Its type flag is chosen from the
    lower-cased name: ``chr<digits>`` sets is_auto, ``chr`` followed by
    w-z sets is_sex, ``chrm`` sets is_mito and ``chrun``/``chrur`` set
    is_rand. Names matching none of these produce a warning on stderr and
    are flagged is_rand. In addition, a name containing "rand" sets
    is_rand and a name containing "hap" sets is_hap.

    Returns the Chromosome list sorted with ``Chromosome.key``; ``idnum``
    is assigned 1..N following that order.

    Raises ValueError if a line has fewer than two columns.
    """
    if util.is_gzipped(filename):
        f = gzip.open(filename, "rt")
    else:
        f = open(filename)

    chrom_list = []

    # use a context manager so a ValueError below cannot leak the handle
    with f:
        for line in f:
            words = line.rstrip().split()

            if len(words) < 2:
                raise ValueError("expected at least two columns per line\n")

            chrom = Chromosome(name=words[0], length=int(words[1]))
            chrom_list.append(chrom)

            lc_name = chrom.name.lower()

            # determine whether this is autosome, sex or mitochondrial chrom
            # (pattern written as a raw string so "\d" is not treated as an
            # invalid string escape)
            if re.match(r'^chr(\d+)', lc_name):
                chrom.is_auto = True
            elif re.match("^chr[W-Zw-z]", lc_name):
                chrom.is_sex = True
            elif lc_name.startswith("chrm"):
                chrom.is_mito = True
            elif lc_name.startswith(("chrun", "chrur")):
                chrom.is_rand = True
            else:
                sys.stderr.write("WARNING: could not determine chromosome type "
                                 "(autosome, sex, mitochondrial) from name "
                                 "'%s'. Assuming 'random'\n" % chrom.name)
                chrom.is_rand = True

            if "rand" in chrom.name:
                # random chromosome
                chrom.is_rand = True

            if "hap" in chrom.name:
                # alt haplotype chromosome
                chrom.is_hap = True

    chrom_list.sort(key=Chromosome.key)

    # assign consecutive ids, starting at 1, in sorted order
    for idnum, chrom in enumerate(chrom_list, start=1):
        chrom.idnum = idnum

    return chrom_list
def open_files(file_list, r_w):
    """Open every file in ``file_list`` with mode ``r_w``.

    For read modes (``r_w`` starts with "r") a file is opened with
    ``gzip.open`` when ``util.is_gzipped`` reports it as compressed; for
    all other modes the decision is made from a ".gz" filename suffix.
    Everything else is opened with the builtin ``open``.

    The mode string is passed through unchanged. Note that ``gzip.open``
    treats "r"/"w" as binary unless the mode contains "t", whereas the
    builtin ``open`` treats them as text.

    Returns a list of open file objects in the same order as
    ``file_list``. If any open fails, the files opened so far are closed
    and the exception is re-raised.
    """
    reading = r_w.startswith("r")
    files = []

    try:
        for filename in file_list:
            if reading:
                compressed = util.is_gzipped(filename)
            else:
                compressed = filename.endswith(".gz")

            opener = gzip.open if compressed else open
            files.append(opener(filename, r_w))
    except Exception:
        # do not leak the handles that were opened before the failure
        for opened in files:
            opened.close()
        raise

    return files
def open_files(file_list, r_w):
    """Return open file objects for each name in ``file_list``.

    ``r_w`` is the mode string and is handed unchanged to the opener.
    When it starts with "r", ``util.is_gzipped`` decides whether a file
    is opened with ``gzip.open``; otherwise a ".gz" filename suffix
    decides. Files that are not treated as gzip use the builtin ``open``.

    Be aware that ``gzip.open`` interprets "r"/"w" as binary modes unless
    "t" is part of the mode, unlike the builtin ``open``.

    The returned list preserves the order of ``file_list``. Should one of
    the opens raise, every file opened before it is closed and the
    exception propagates.
    """
    is_read_mode = r_w.startswith("r")
    files = []

    try:
        for filename in file_list:
            if is_read_mode:
                use_gzip = util.is_gzipped(filename)
            else:
                use_gzip = filename.endswith(".gz")

            if use_gzip:
                files.append(gzip.open(filename, r_w))
            else:
                files.append(open(filename, r_w))
    except Exception:
        # close what was already opened so a partial failure leaks nothing
        for opened in files:
            opened.close()
        raise

    return files
def parse_samples(samples_str):
    """Gets list of samples from --samples argument.

    This may be a comma-delimited string or a path to a file. If a
    file is provided then the first whitespace-separated token on each
    non-blank line is taken as a sample name (gzip-compressed files, as
    reported by util.is_gzipped, are read as text).

    Returns None when samples_str is None, otherwise a list of sample
    name strings. Progress and warning messages are written to stderr.
    """
    if samples_str is None:
        return None

    # first check if this is a path to a file
    if os.path.exists(samples_str) and not os.path.isdir(samples_str):
        # open gzip in text mode: the default binary mode would yield
        # bytes sample names on Python 3
        if util.is_gzipped(samples_str):
            f = gzip.open(samples_str, "rt")
        else:
            f = open(samples_str)

        samples = []
        with f:
            for line in f:
                words = line.split()
                # skip blank lines instead of failing with IndexError
                if words:
                    # assume first token in line is sample name
                    samples.append(words[0])

        sys.stderr.write("read %d sample names from file '%s'\n" %
                         (len(samples), samples_str))
    else:
        # otherwise assume comma-delimited string
        if ("," not in samples_str and len(samples_str) > 15) \
           or ("/" in samples_str):
            # trailing newline added so the next message starts on its
            # own line
            sys.stderr.write("WARNING: --samples argument (%s) "
                             "does not look like sample name "
                             "but is not path to valid file. "
                             "Assuming it is a sample name anyway.\n"
                             % samples_str)

        samples = samples_str.split(",")
        sys.stderr.write("SAMPLES: %s\n" % repr(samples))

    return samples
def parse_samples(samples_str):
    """Gets list of samples from --samples argument.

    The argument is either a comma-delimited string of sample names or
    a path to an existing (non-directory) file. For a file, the first
    whitespace-separated token of every non-blank line is used as a
    sample name; gzip-compressed files (per util.is_gzipped) are
    supported.

    Returns None when samples_str is None, otherwise a list of sample
    name strings. Progress and warning messages are written to stderr.
    """
    if samples_str is None:
        return None

    # first check if this is a path to a file
    if os.path.exists(samples_str) and not os.path.isdir(samples_str):
        if util.is_gzipped(samples_str):
            f = gzip.open(samples_str, "rt")
        else:
            f = open(samples_str, "rt")

        samples = []
        # context manager: the handle is closed even if reading fails
        with f:
            for line in f:
                tokens = line.split()
                if not tokens:
                    # blank line: nothing to read (used to raise IndexError)
                    continue
                # assume first token in line is sample name
                samples.append(tokens[0])

        sys.stderr.write("read %d sample names from file '%s'\n" %
                         (len(samples), samples_str))
    else:
        # otherwise assume comma-delimited string
        looks_like_path = "/" in samples_str
        long_single_name = "," not in samples_str and len(samples_str) > 15
        if long_single_name or looks_like_path:
            # message now ends with a newline so it does not run into
            # the "SAMPLES:" line that follows
            sys.stderr.write("WARNING: --samples argument (%s) "
                             "does not look like sample name "
                             "but is not path to valid file. "
                             "Assuming it is a sample name anyway.\n"
                             % samples_str)

        samples = samples_str.split(",")
        sys.stderr.write("SAMPLES: %s\n" % repr(samples))

    return samples
def main():
    """Rewrite the heterozygote-probability column of a SNP table.

    Reads args.infile (plain or gzip text), copies its first line to
    args.outfile unchanged, and then processes every following line:
    lines whose field 9 (0-based) is "NA" are copied as-is; for all other
    lines field 10 is replaced by the ";"-joined values returned by
    process_one_snp(), which is given the split line, the two open HDF5
    count files (args.ref_as_counts, args.alt_as_counts) and the error
    rate (fixed at 0.01).

    Output is gzip-compressed when args.outfile ends in ".gz".
    Exits with status -1 if the input file is empty.
    """
    error = 0.01
    args = parse_options()

    # PyTables 3 renamed openFile() to open_file(); accept either
    open_h5 = getattr(tables, "open_file", None) or tables.openFile

    infile = outfile = ref_count_h5 = alt_count_h5 = None
    try:
        if util.is_gzipped(args.infile):
            infile = gzip.open(args.infile, "rt")
        else:
            infile = open(args.infile, "r")

        # text mode is required for gzip too: str lines are written below
        if args.outfile.endswith(".gz"):
            outfile = gzip.open(args.outfile, "wt")
        else:
            outfile = open(args.outfile, "w")

        ref_count_h5 = open_h5(args.ref_as_counts)
        alt_count_h5 = open_h5(args.alt_as_counts)

        # first line is copied through unchanged
        header = infile.readline()
        if not header:
            sys.stderr.write("The input file was empty.\n")
            sys.exit(-1)
        outfile.write(header)

        for snp_line in infile:
            snpinfo = snp_line.strip().split()
            if snpinfo[9] == "NA":
                outfile.write(snp_line)
            else:
                new_hetps = process_one_snp(snpinfo, ref_count_h5,
                                            alt_count_h5, error)
                outfile.write("\t".join(snpinfo[:10] +
                                        [";".join(new_hetps)] +
                                        snpinfo[11:]) + "\n")
    finally:
        # always close: an unclosed gzip writer can leave a truncated file
        for handle in (alt_count_h5, ref_count_h5, outfile, infile):
            if handle is not None:
                handle.close()
def main():
    """Rewrite the heterozygote-probability column of a SNP table.

    The first line of args.infile (plain or gzip text) is copied to
    args.outfile unchanged. Each later line is split on whitespace: if
    field 9 (0-based) is "NA" the line is copied as-is, otherwise field
    10 is replaced with the ";"-joined values returned by
    process_one_snp(), called with the split line, the HDF5 files opened
    from args.ref_as_counts and args.alt_as_counts, and an error rate of
    0.01. Rewritten lines are tab-delimited.

    Output is gzip-compressed when args.outfile ends in ".gz".
    Exits with status -1 if the input file is empty.
    """
    error = 0.01
    args = parse_options()

    infile = outfile = ref_count_h5 = alt_count_h5 = None
    try:
        if util.is_gzipped(args.infile):
            infile = gzip.open(args.infile, "rt")
        else:
            infile = open(args.infile, "rt")

        if args.outfile.endswith(".gz"):
            outfile = gzip.open(args.outfile, "wt")
        else:
            outfile = open(args.outfile, "wt")

        ref_count_h5 = tables.open_file(args.ref_as_counts)
        alt_count_h5 = tables.open_file(args.alt_as_counts)

        # first line is copied through unchanged
        header = infile.readline()
        if not header:
            sys.stderr.write("The input file was empty.\n")
            sys.exit(-1)
        outfile.write(header)

        for snp_line in infile:
            snpinfo = snp_line.strip().split()
            if snpinfo[9] == "NA":
                outfile.write(snp_line)
                continue

            new_hetps = process_one_snp(snpinfo, ref_count_h5,
                                        alt_count_h5, error)
            fields = snpinfo[:10] + [";".join(new_hetps)] + snpinfo[11:]
            outfile.write("\t".join(fields) + "\n")
    finally:
        # the original never closed infile/outfile; an unclosed gzip
        # writer can leave a truncated file, so close everything here
        for handle in (alt_count_h5, ref_count_h5, outfile, infile):
            if handle is not None:
                handle.close()
def read_file(self, filename):
    """Read SNPs and indels from a text input file.

    Every line must hold at least three whitespace-separated values:
    an integer position (>= 1) and two alleles. Alleles are upper-cased
    and "-" characters are stripped out before they are stored.

    On success this sets:
      snp_pos     -- int32 array of positions, in file order
      snp_allele1 -- "|S10" array of first alleles
      snp_allele2 -- "|S10" array of second alleles
      snp_index   -- int32 array of length max(position) where element
                     pos-1 is the index of that SNP in the arrays above
                     and every other element is SNP_UNDEF
      n_snp       -- number of SNPs read
      haplotypes  -- None (haplotypes are only read from HDF5 files)

    If the file cannot be opened (IOError) a warning is written to
    stderr, self.clear() is called and the method returns.

    Raises ValueError for a line with fewer than three values or with a
    position that is not >= 1.
    """
    try:
        if util.is_gzipped(filename):
            f = gzip.open(filename, "rt")
        else:
            f = open(filename, "rt")
    except IOError:
        sys.stderr.write("WARNING: unable to read from file '%s', "
                         "assuming no SNPs for this chromosome\n" %
                         filename)
        self.clear()
        return

    snp_pos_list = []
    snp_allele1_list = []
    snp_allele2_list = []
    max_pos = 0

    # context manager so the handle is released even when a bad line
    # raises ValueError
    with f:
        for line in f:
            words = line.split()

            if len(words) < 3:
                raise ValueError("expected at least 3 values per SNP "
                                 "file line but got %d:\n"
                                 "%s\n" % (len(words), line))

            pos = int(words[0])
            a1 = words[1].upper().replace("-", "")
            a2 = words[2].upper().replace("-", "")

            if pos <= 0:
                raise ValueError("expected SNP position to be >= 1:\n%s\n" %
                                 line)

            max_pos = max(max_pos, pos)

            snp_pos_list.append(pos)
            snp_allele1_list.append(a1)
            snp_allele2_list.append(a2)

    # convert lists to numpy arrays, which allow for faster
    # lookups and use less memory
    self.snp_pos = np.array(snp_pos_list, dtype=np.int32)
    del snp_pos_list
    self.snp_allele1 = np.array(snp_allele1_list, dtype="|S10")
    del snp_allele1_list
    self.snp_allele2 = np.array(snp_allele2_list, dtype="|S10")
    del snp_allele2_list

    # make another array that makes it easy to lookup SNPs by their
    # position on the chromosome (positions are 1-based, hence pos-1)
    self.snp_index = np.empty(max_pos, dtype=np.int32)
    self.snp_index[:] = SNP_UNDEF
    self.snp_index[self.snp_pos - 1] = np.arange(self.snp_pos.shape[0])

    self.n_snp = self.snp_pos.shape[0]

    # currently haplotypes can only be read from HDF5 file
    self.haplotypes = None
def main():
    """Write per-SNP haplotype read counts for every line of the input.

    A header is written to stdout, then args.input_file (plain or gzip
    text) is read line by line. Lines starting with "#" are skipped.
    Lines whose second field is "NA" produce an NA output line. For all
    other lines the first field is the chromosome name (a "chr" prefix
    is tried if the bare name is unknown), the second is the SNP
    position and fields 3 and 4 (0-based) are the ref/alt bases.

    For each SNP the target regions are obtained with
    get_target_regions(), SNPs in those regions and the test SNP itself
    are fetched with get_region_snps(), and the result is emitted with
    write_output(). Lines on unknown chromosomes are skipped with a
    single warning per chromosome name. A "." is written to stderr every
    1000 input lines as a progress indicator.
    """
    args = parse_args()

    write_header(sys.stdout)

    # find index of individual in list of samples
    ind_idx = lookup_individual_index(args, args.individual)

    data_files = DataFiles(args)

    try:
        chrom_list = chromosome.get_all_chromosomes(args.chrom)
        chrom_dict = chromosome.get_chromosome_dict(args.chrom)

        genomewide_read_counts = get_genomewide_count(
            data_files.read_count_h5, chrom_list)

        # chromosome names already warned about
        unknown_chrom = set()

        if util.is_gzipped(args.input_file):
            f = gzip.open(args.input_file, "rt")
        else:
            f = open(args.input_file, "r")

        if args.target_region_size:
            sys.stderr.write("setting target region size to %d\n" %
                             args.target_region_size)

        with f:
            for line_count, line in enumerate(f, start=1):
                if line_count % 1000 == 0:
                    sys.stderr.write(".")

                if line.startswith("#"):
                    continue

                words = line.rstrip().split()

                if words[1] == "NA":
                    # no SNP defined on this line:
                    write_NA_line(sys.stdout)
                    continue

                chrom_name = words[0]
                prefixed_name = "chr" + chrom_name
                if chrom_name in chrom_dict:
                    chrom = chrom_dict[chrom_name]
                elif (not chrom_name.startswith("chr")
                      and prefixed_name in chrom_dict):
                    # try adding 'chr' to front of name
                    chrom = chrom_dict[prefixed_name]
                else:
                    # can't figure out this chromosome name; skip the
                    # line. The original only skipped names lacking a
                    # "chr" prefix, letting other unknown names fall
                    # through with a stale or undefined chrom.
                    if chrom_name not in unknown_chrom:
                        unknown_chrom.add(chrom_name)
                        sys.stderr.write("WARNING: unknown chromosome "
                                         "'%s'\n" % chrom_name)
                    continue

                region_list = get_target_regions(args, chrom, words)

                snp_pos = int(words[1])
                snp_ref_base = words[3]
                snp_alt_base = words[4]
                # TODO: check that SNP ref/alt match?

                snp_region = coord.Coord(chrom, snp_pos, snp_pos)

                # pull out all of the SNPs in the target region(s)
                region_snps = get_region_snps(data_files, region_list,
                                              ind_idx)

                # pull out test SNP
                test_snp_list = get_region_snps(data_files, [snp_region],
                                                ind_idx)
                if len(test_snp_list) != 1:
                    test_snp = None
                    sys.stderr.write("WARNING: could not find test SNP at "
                                     "position %s:%d\n" %
                                     (chrom.name, snp_pos))
                    het_snps = []
                else:
                    test_snp = test_snp_list[0]

                    # pull out haplotype counts from linked heterozygous
                    # SNPs
                    het_snps = get_het_snps(region_snps)
                    set_snp_counts(data_files, region_list, het_snps,
                                   test_snp, args)

                region_read_counts = get_region_read_counts(data_files,
                                                            region_list)

                write_output(sys.stdout, region_list, het_snps, test_snp,
                             snp_pos, region_read_counts,
                             genomewide_read_counts)

        sys.stderr.write("\n")
    finally:
        # release data files even when processing fails part-way
        data_files.close()
def main():
    """Write per-SNP haplotype read counts for every line of the input.

    After writing a header to stdout, args.input_file (plain or gzip
    text) is processed one line at a time. Lines starting with "#" are
    ignored and lines whose second field is "NA" yield an NA output
    line. Otherwise the first field names the chromosome (retried with a
    "chr" prefix when the bare name is unknown), the second field is the
    SNP position and fields 3 and 4 (0-based) are the ref/alt bases.

    Target regions come from get_target_regions(); SNPs in those regions
    and the test SNP are looked up with get_region_snps(); the combined
    result is emitted through write_output(). Lines on chromosomes that
    cannot be resolved are skipped, with one warning per distinct name.
    A "." is written to stderr every 1000 input lines to show progress.
    """
    args = parse_args()

    write_header(sys.stdout)

    # find index of individual in list of samples
    ind_idx = lookup_individual_index(args, args.individual)

    data_files = DataFiles(args)

    try:
        chrom_list = chromosome.get_all_chromosomes(args.chrom)
        chrom_dict = chromosome.get_chromosome_dict(args.chrom)

        genomewide_read_counts = get_genomewide_count(
            data_files.read_count_h5, chrom_list)

        # names of unresolvable chromosomes that were already reported
        unknown_chrom = set()

        if util.is_gzipped(args.input_file):
            f = gzip.open(args.input_file, "rt")
        else:
            f = open(args.input_file, "rt")

        if args.target_region_size:
            sys.stderr.write("setting target region size to %d\n" %
                             args.target_region_size)

        with f:
            for line_count, line in enumerate(f, start=1):
                if line_count % 1000 == 0:
                    sys.stderr.write(".")

                if line.startswith("#"):
                    continue

                words = line.rstrip().split()

                if words[1] == "NA":
                    # no SNP defined on this line:
                    write_NA_line(sys.stdout)
                    continue

                chrom_name = words[0]
                if chrom_name in chrom_dict:
                    chrom = chrom_dict[chrom_name]
                elif (not chrom_name.startswith("chr")
                      and ("chr" + chrom_name) in chrom_dict):
                    # try adding 'chr' to front of name
                    chrom = chrom_dict["chr" + chrom_name]
                else:
                    # can't figure out this chromosome name. Every
                    # unresolved name is skipped here; previously names
                    # that already began with "chr" fell through and
                    # reused a stale (or undefined) chrom.
                    if chrom_name not in unknown_chrom:
                        unknown_chrom.add(chrom_name)
                        # the original message had a "%s" placeholder
                        # but no argument and no trailing newline
                        sys.stderr.write("WARNING: unknown chromosome "
                                         "'%s'\n" % chrom_name)
                    continue

                region_list = get_target_regions(args, chrom, words)

                snp_pos = int(words[1])
                snp_ref_base = words[3]
                snp_alt_base = words[4]
                # TODO: check that SNP ref/alt match?

                snp_region = coord.Coord(chrom, snp_pos, snp_pos)

                # pull out all of the SNPs in the target region(s)
                region_snps = get_region_snps(data_files, region_list,
                                              ind_idx)

                # pull out test SNP
                test_snp_list = get_region_snps(data_files, [snp_region],
                                                ind_idx)
                if len(test_snp_list) != 1:
                    test_snp = None
                    sys.stderr.write("WARNING: could not find test SNP at "
                                     "position %s:%d\n" %
                                     (chrom.name, snp_pos))
                    het_snps = []
                else:
                    test_snp = test_snp_list[0]

                    # pull out haplotype counts from linked heterozygous
                    # SNPs
                    het_snps = get_het_snps(region_snps)
                    set_snp_counts(data_files, region_list, het_snps,
                                   test_snp, args)

                region_read_counts = get_region_read_counts(data_files,
                                                            region_list)

                write_output(sys.stdout, region_list, het_snps, test_snp,
                             snp_pos, region_read_counts,
                             genomewide_read_counts)

        sys.stderr.write("\n")
    finally:
        # make sure the data files are closed even if an error occurs
        data_files.close()