Beispiel #1
0
def open_input_files(in_filename):
    """Open every file named in a list file and skip each one's header line.

    ``in_filename`` is a text file holding one input path per line. Each
    listed path is opened for reading in text mode (through ``gzip`` when
    ``util.is_gzipped`` says so) and its first line is consumed, so the
    returned handles are positioned just past the header.

    Returns the list of open file objects in the order they were listed;
    closing them is the caller's job.

    Writes a message to stderr and exits with status 2 when the list file or
    any listed path is missing or not a regular file (a blank line counts as
    a missing path), or when the list file names no files at all.
    """
    # isfile() is already False for paths that do not exist
    if not os.path.isfile(in_filename):
        sys.stderr.write("input file %s does not exist or is not a regular file\n" %
                         in_filename)
        sys.exit(2)

    infiles = []

    # read file that contains list of input files
    with open(in_filename) as in_file:
        for line in in_file:
            filename = line.rstrip()
            if not filename or not os.path.isfile(filename):
                # do not leak the handles that were opened so far
                for opened in infiles:
                    opened.close()
                # report the stripped name rather than the raw line, which
                # would embed a newline in the middle of the message
                sys.stderr.write("input file '%s' does not exist or is not a regular file\n"
                                 % filename)
                sys.exit(2)

            if util.is_gzipped(filename):
                f = gzip.open(filename, "rt")
            else:
                f = open(filename)

            # skip header
            f.readline()

            infiles.append(f)

    if not infiles:
        # in_filename is the only name for the list file in this scope
        sys.stderr.write("no input files specified in file '%s'\n" % in_filename)
        sys.exit(2)

    return infiles
Beispiel #2
0
def open_input_files(in_filename):
    """Open every file named in a list file and skip each one's header line.

    ``in_filename`` is a text file holding one input path per line. Each
    path is echoed to stderr, opened for reading in text mode (through
    ``gzip`` when ``util.is_gzipped`` says so) and has its first line
    consumed, so the returned handles are positioned just past the header.

    Returns the list of open file objects in the order they were listed;
    closing them is the caller's job.

    Raises IOError when the list file itself is missing or not a regular
    file. Writes a message to stderr and exits with status 2 when a listed
    path is missing or not a regular file (a blank line counts as missing),
    or when the list file names no files.
    """
    # isfile() is already False for paths that do not exist
    if not os.path.isfile(in_filename):
        raise IOError("input file %s does not exist or is not a "
                      "regular file\n" % in_filename)

    infiles = []

    # read file that contains list of input files
    with open(in_filename, "rt") as in_file:
        for line in in_file:
            filename = line.rstrip()
            sys.stderr.write(" " + filename + "\n")
            if not filename or not os.path.isfile(filename):
                # do not leak the handles that were opened so far
                for opened in infiles:
                    opened.close()
                # name the offending path, not the list-file object
                sys.stderr.write("input file '%s' does not exist or is not a "
                                 "regular file\n" % filename)
                sys.exit(2)

            if util.is_gzipped(filename):
                f = gzip.open(filename, "rt")
            else:
                f = open(filename, "rt")

            # skip header
            f.readline()

            infiles.append(f)

    if not infiles:
        sys.stderr.write("no input files specified in file '%s'\n" %
                         in_filename)
        sys.exit(2)

    return infiles
Beispiel #3
0
    def read_file(self, filename):
        """Read SNPs and indels from a text input file into this object.

        Every line must hold at least three whitespace-separated fields:
        a position (an integer >= 1), allele 1 and allele 2. Alleles are
        upper-cased and any '-' characters are removed.

        On success sets ``snp_pos``, ``snp_allele1``, ``snp_allele2``,
        ``snp_index`` (indexed by position - 1, holding the SNP's row or
        ``SNP_UNDEF``) and ``n_snp``. If the file cannot be opened, a warning
        is written to stderr, ``self.clear()`` is called and the method
        returns without raising.

        Raises ValueError for a line with fewer than three fields or a
        position below 1 (and, via ``int()``, for a non-integer position).
        """
        try:
            if util.is_gzipped(filename):
                f = gzip.open(filename, "rt")
            else:
                f = open(filename, "rt")
        except IOError:
            sys.stderr.write("WARNING: unable to read from file '%s', "
                             "assuming no SNPs for this chromosome\n" %
                             filename)
            self.clear()
            return

        snp_pos_list = []
        snp_allele1_list = []
        snp_allele2_list = []
        max_pos = 0

        # 'with' closes the handle even when a malformed line raises below
        with f:
            for line in f:
                words = line.split()

                if len(words) < 3:
                    raise ValueError("expected at least 3 values per SNP "
                                     "file line but got %d:\n"
                                     "%s\n" % (len(words), line))

                pos = int(words[0])
                a1 = words[1].upper().replace("-", "")
                a2 = words[2].upper().replace("-", "")

                if pos <= 0:
                    raise ValueError("expected SNP position to be >= 1:\n%s\n" %
                                     line)

                if pos > max_pos:
                    max_pos = pos

                snp_pos_list.append(pos)
                snp_allele1_list.append(a1)
                snp_allele2_list.append(a2)

        # convert lists to numpy arrays, which allow for faster
        # lookups and use less memory
        # NOTE: the fixed-width "|S10" dtype keeps at most 10 bytes per allele
        self.snp_pos = np.array(snp_pos_list, dtype=np.int32)
        del snp_pos_list
        self.snp_allele1 = np.array(snp_allele1_list, dtype="|S10")
        del snp_allele1_list
        self.snp_allele2 = np.array(snp_allele2_list, dtype="|S10")
        del snp_allele2_list

        # make another array that makes it easy to lookup SNPs by their position
        # on the chromosome; when a position repeats, the last row wins
        self.snp_index = np.empty(max_pos, dtype=np.int32)
        self.snp_index[:] = SNP_UNDEF
        self.snp_index[self.snp_pos-1] = np.arange(self.snp_pos.shape[0])

        self.n_snp = self.snp_pos.shape[0]
Beispiel #4
0
def get_all_chromosomes(filename):
    """Read chromosome names and lengths from a file into Chromosome objects.

    Each line of ``filename`` (optionally gzipped) must have at least two
    whitespace-separated columns: the chromosome name and its integer
    length. Every chromosome is classified from its lower-cased name as
    autosome, sex, mitochondrial or random; names that match none of the
    patterns produce a warning on stderr and are treated as random. Names
    containing "rand" or "hap" additionally get ``is_rand`` / ``is_hap`` set.

    Returns the chromosomes sorted by ``Chromosome.key`` with ``idnum``
    assigned 1, 2, 3, ... in that order.

    Raises ValueError for a line with fewer than two columns (blank lines
    included).
    """
    if util.is_gzipped(filename):
        f = gzip.open(filename, "rt")
    else:
        f = open(filename, "rt")

    chrom_list = []

    # 'with' closes the handle even if a malformed line raises
    with f:
        for line in f:
            words = line.rstrip().split()

            if len(words) < 2:
                raise ValueError("expected at least two columns per line\n")

            chrom = Chromosome(name=words[0], length=int(words[1]))
            chrom_list.append(chrom)

            lc_name = chrom.name.lower()

            # determine whether this is autosome, sex or mitochondrial chrom
            # (raw string: "\d" is not a valid escape in a plain literal)
            if re.match(r'^chr(\d+)', lc_name):
                chrom.is_auto = True
            elif re.match("^chr[W-Zw-z]", lc_name):
                chrom.is_sex = True
            elif lc_name.startswith("chrm"):
                chrom.is_mito = True
            elif lc_name.startswith(("chrun", "chrur")):
                chrom.is_rand = True
            else:
                sys.stderr.write("WARNING: could not determine chromosome type "
                                 "(autosome, sex, mitochondrial) from name "
                                 "'%s'. Assuming 'random'\n" % chrom.name)
                chrom.is_rand = True

            if "rand" in chrom.name:
                # random chromosome
                chrom.is_rand = True

            if "hap" in chrom.name:
                # alt haplotype chromosome
                chrom.is_hap = True

    chrom_list.sort(key=Chromosome.key)

    # number the chromosomes in sorted order, starting at 1
    for idnum, chrom in enumerate(chrom_list, start=1):
        chrom.idnum = idnum

    return chrom_list
Beispiel #5
0
def get_all_chromosomes(filename):
    """Build a sorted, numbered list of Chromosome objects from a text file.

    The file (plain or gzipped, as reported by ``util.is_gzipped``) is read
    line by line; column one is the chromosome name and column two its
    integer length. The lower-cased name decides the type flag: autosome,
    sex, mitochondrial or random. Unrecognised names trigger a warning on
    stderr and are flagged random. A name containing "rand" is also flagged
    random and one containing "hap" is flagged as an alternate haplotype.

    Returns the list sorted by ``Chromosome.key``; ``idnum`` is set to the
    1-based rank of each chromosome in that order.

    Raises ValueError when a line has fewer than two columns.
    """
    opener = gzip.open if util.is_gzipped(filename) else open

    chrom_list = []

    # the context manager releases the handle even when parsing fails
    with opener(filename, "rt") as f:
        for line in f:
            words = line.rstrip().split()

            if len(words) < 2:
                raise ValueError("expected at least two columns per line\n")

            chrom = Chromosome(name=words[0], length=int(words[1]))
            chrom_list.append(chrom)

            lc_name = chrom.name.lower()

            # determine whether this is autosome, sex or mitochondrial chrom
            # (pattern written as a raw string so "\d" reaches the regex engine
            # without relying on an invalid string escape)
            if re.match(r'^chr(\d+)', lc_name):
                chrom.is_auto = True
            elif re.match("^chr[W-Zw-z]", lc_name):
                chrom.is_sex = True
            elif lc_name.startswith("chrm"):
                chrom.is_mito = True
            elif lc_name.startswith("chrun") or lc_name.startswith("chrur"):
                chrom.is_rand = True
            else:
                sys.stderr.write("WARNING: could not determine chromosome type "
                                 "(autosome, sex, mitochondrial) from name "
                                 "'%s'. Assuming 'random'\n" % chrom.name)
                chrom.is_rand = True

            if "rand" in chrom.name:
                # random chromosome
                chrom.is_rand = True

            if "hap" in chrom.name:
                # alt haplotype chromosome
                chrom.is_hap = True

    chrom_list.sort(key=Chromosome.key)

    idnum = 1
    for chrom in chrom_list:
        chrom.idnum = idnum
        idnum += 1

    return chrom_list
def open_files(file_list, r_w):
    """Open each path in ``file_list`` with mode ``r_w`` and return the handles.

    For read modes (``r_w`` starting with "r") a file is opened through
    ``gzip`` when ``util.is_gzipped`` reports it as gzipped. For any other
    mode the decision is made from the ".gz" filename suffix. The returned
    list follows the order of ``file_list``.
    """
    handles = []

    for path in file_list:
        # reading: ask util about the file; writing: trust the extension
        if r_w.startswith("r"):
            compressed = util.is_gzipped(path)
        else:
            compressed = path.endswith(".gz")

        opener = gzip.open if compressed else open
        handles.append(opener(path, r_w))

    return handles
Beispiel #7
0
def open_files(file_list, r_w):
    """Return open file objects, one per entry of ``file_list``, in order.

    ``r_w`` is passed unchanged as the mode. When it starts with "r",
    ``util.is_gzipped`` decides whether ``gzip.open`` is used; otherwise a
    ".gz" filename suffix selects ``gzip.open`` and anything else is opened
    with the builtin ``open``.
    """
    def _open_one(path):
        # pick the gzip test that matches the direction of the I/O
        if r_w.startswith("r"):
            use_gzip = util.is_gzipped(path)
        else:
            use_gzip = path.endswith(".gz")

        if use_gzip:
            return gzip.open(path, r_w)
        return open(path, r_w)

    return [_open_one(path) for path in file_list]
Beispiel #8
0
def parse_samples(samples_str):
    """Get the list of samples from the --samples argument.

    ``samples_str`` may be a comma-delimited string of sample names or a
    path to an existing (optionally gzipped) file. For a file, the first
    whitespace-delimited token of every non-blank line is taken as a sample
    name. Progress and warning messages are written to stderr.

    Returns a list of sample-name strings, or None when ``samples_str`` is
    None.
    """
    if samples_str is None:
        return None

    # first check if this is a path to a file
    if os.path.exists(samples_str) and not os.path.isdir(samples_str):
        # text mode in both branches so names are str, never bytes
        if util.is_gzipped(samples_str):
            f = gzip.open(samples_str, "rt")
        else:
            f = open(samples_str, "rt")

        samples = []
        with f:
            for line in f:
                tokens = line.split()
                # blank lines carry no sample name; skip them
                if tokens:
                    # assume first token in line is sample name
                    samples.append(tokens[0])

        sys.stderr.write("read %d sample names from file '%s'\n" %
                         (len(samples), samples_str))
        return samples

    # otherwise assume comma-delimited string
    if ("," not in samples_str and len(samples_str) > 15) \
       or ("/" in samples_str):
        # trailing newline keeps this apart from the SAMPLES line below
        sys.stderr.write("WARNING: --samples argument (%s) "
                         "does not look like sample name "
                         "but is not path to valid file. "
                         "Assuming it is a sample name anyway.\n"
                         % samples_str)

    samples = samples_str.split(",")
    sys.stderr.write("SAMPLES: %s\n" % repr(samples))

    return samples
Beispiel #9
0
def parse_samples(samples_str):
    """Get the list of samples from the --samples argument.

    The argument is either a path to an existing file (plain or gzipped),
    in which case the first whitespace-delimited token of each non-blank
    line is a sample name, or a comma-delimited string of sample names.
    A note about what was read is written to stderr, along with a warning
    when a non-path argument does not look like a sample name.

    Returns a list of sample-name strings, or None when ``samples_str`` is
    None.
    """
    if samples_str is None:
        return None

    is_file = os.path.exists(samples_str) and not os.path.isdir(samples_str)

    if not is_file:
        # otherwise assume comma-delimited string
        looks_odd = ("," not in samples_str and len(samples_str) > 15) \
            or ("/" in samples_str)
        if looks_odd:
            # newline added so the next message starts on its own line
            sys.stderr.write("WARNING: --samples argument (%s) "
                             "does not look like sample name "
                             "but is not path to valid file. "
                             "Assuming it is a sample name anyway.\n"
                             % samples_str)

        samples = samples_str.split(",")
        sys.stderr.write("SAMPLES: %s\n" % repr(samples))
        return samples

    opener = gzip.open if util.is_gzipped(samples_str) else open

    samples = []
    # context manager closes the file even if reading fails part-way
    with opener(samples_str, "rt") as f:
        for line in f:
            fields = line.split()
            if not fields:
                # a blank line has no first token to take
                continue
            # assume first token in line is sample name
            samples.append(fields[0])

    sys.stderr.write("read %d sample names from file '%s'\n" %
                     (len(samples), samples_str))

    return samples
def main():
    """Rewrite one column of a SNP table using allele-specific read counts.

    Reads ``args.infile`` (plain or gzipped) and writes ``args.outfile``
    (gzipped when its name ends in ".gz"). The first line is copied through
    as the header. For each following line, if field 9 (0-based) is "NA" the
    line is copied unchanged; otherwise field 10 is replaced by the
    ";"-joined values returned by ``process_one_snp`` and the fields are
    re-joined with tabs.

    Exits with status -1 after a message on stderr when the input is empty.
    """
    error = 0.01
    args = parse_options()

    in_open = gzip.open if util.is_gzipped(args.infile) else open
    out_open = gzip.open if args.outfile.endswith(".gz") else open

    # Text mode on both sides: str lines are read and str lines are written,
    # so a gzip output opened in binary "w" would reject the writes.
    # The 'with' block also guarantees the (possibly gzipped) output is
    # flushed and closed on every exit path.
    with in_open(args.infile, "rt") as infile, \
            out_open(args.outfile, "wt") as outfile:
        # open_file is the PyTables 3 spelling of the legacy openFile
        ref_count_h5 = tables.open_file(args.ref_as_counts)
        try:
            alt_count_h5 = tables.open_file(args.alt_as_counts)
            try:
                snp_line = infile.readline()
                if snp_line:
                    outfile.write(snp_line)
                else:
                    sys.stderr.write("The input file was empty.\n")
                    sys.exit(-1)

                for snp_line in infile:
                    snpinfo = snp_line.strip().split()
                    if snpinfo[9] == "NA":
                        outfile.write(snp_line)
                    else:
                        new_hetps = process_one_snp(snpinfo, ref_count_h5,
                                                    alt_count_h5, error)
                        outfile.write("\t".join(snpinfo[:10] +
                                                [";".join(new_hetps)] +
                                                snpinfo[11:]) + "\n")
            finally:
                alt_count_h5.close()
        finally:
            ref_count_h5.close()
Beispiel #11
0
def main():
    """Rewrite one column of a SNP table using allele-specific read counts.

    The header line of ``args.infile`` is copied to ``args.outfile``. Each
    later line is split on whitespace: when field 9 (0-based) is "NA" the
    line is written back unchanged, otherwise field 10 is replaced with the
    ";"-joined result of ``process_one_snp`` and the line is written
    tab-delimited. Input may be gzipped; output is gzipped when its name
    ends in ".gz".

    Exits with status -1 after a message on stderr when the input is empty.
    """
    error = 0.01
    args = parse_options()

    if util.is_gzipped(args.infile):
        infile = gzip.open(args.infile, "rt")
    else:
        infile = open(args.infile, "rt")

    # handles opened so far; all are closed in the finally clause so the
    # output (gzip in particular) is always flushed, even on error or exit
    to_close = [infile]
    try:
        if args.outfile.endswith(".gz"):
            outfile = gzip.open(args.outfile, "wt")
        else:
            outfile = open(args.outfile, "wt")
        to_close.append(outfile)

        ref_count_h5 = tables.open_file(args.ref_as_counts)
        to_close.append(ref_count_h5)
        alt_count_h5 = tables.open_file(args.alt_as_counts)
        to_close.append(alt_count_h5)

        snp_line = infile.readline()
        if snp_line:
            outfile.write(snp_line)
        else:
            sys.stderr.write("The input file was empty.\n")
            sys.exit(-1)

        snp_line = infile.readline()
        while snp_line:
            snpinfo = snp_line.strip().split()
            if snpinfo[9] == "NA":
                outfile.write(snp_line)
            else:
                new_hetps = process_one_snp(snpinfo, ref_count_h5,
                                            alt_count_h5, error)
                outfile.write("\t".join(snpinfo[:10] +
                                        [";".join(new_hetps)] +
                                        snpinfo[11:]) + "\n")
            snp_line = infile.readline()
    finally:
        # close in reverse order of opening
        for handle in reversed(to_close):
            handle.close()
Beispiel #12
0
    def read_file(self, filename):
        """Read SNPs and indels from a text input file into this object.

        Each line needs at least three whitespace-separated fields: an
        integer position (>= 1) followed by the two alleles. Alleles are
        upper-cased and stripped of '-' characters.

        Populates ``snp_pos``, ``snp_allele1``, ``snp_allele2``,
        ``snp_index`` (position - 1 -> row in the SNP arrays, or
        ``SNP_UNDEF``) and ``n_snp``, and resets ``haplotypes`` to None.
        When the file cannot be opened, a warning goes to stderr,
        ``self.clear()`` is called and the method returns normally.

        Raises ValueError for a line with fewer than three fields or a
        position below 1 (and, via ``int()``, for a non-integer position).
        """
        try:
            if util.is_gzipped(filename):
                f = gzip.open(filename, "rt")
            else:
                f = open(filename, "rt")
        except IOError:
            sys.stderr.write("WARNING: unable to read from file '%s', "
                             "assuming no SNPs for this chromosome\n" %
                             filename)
            self.clear()
            return

        snp_pos_list = []
        snp_allele1_list = []
        snp_allele2_list = []
        max_pos = 0

        try:
            for line in f:
                words = line.split()

                if len(words) < 3:
                    raise ValueError("expected at least 3 values per SNP "
                                     "file line but got %d:\n"
                                     "%s\n" % (len(words), line))

                pos = int(words[0])
                a1 = words[1].upper().replace("-", "")
                a2 = words[2].upper().replace("-", "")

                if pos <= 0:
                    raise ValueError("expected SNP position to be >= 1:\n%s\n" %
                                     line)

                if pos > max_pos:
                    max_pos = pos

                snp_pos_list.append(pos)
                snp_allele1_list.append(a1)
                snp_allele2_list.append(a2)
        finally:
            # release the handle whether parsing succeeded or raised
            f.close()

        # convert lists to numpy arrays, which allow for faster
        # lookups and use less memory
        # NOTE: the fixed-width "|S10" dtype keeps at most 10 bytes per allele
        self.snp_pos = np.array(snp_pos_list, dtype=np.int32)
        del snp_pos_list
        self.snp_allele1 = np.array(snp_allele1_list, dtype="|S10")
        del snp_allele1_list
        self.snp_allele2 = np.array(snp_allele2_list, dtype="|S10")
        del snp_allele2_list

        # make another array that makes it easy to lookup SNPs by their position
        # on the chromosome; when a position repeats, the last row wins
        self.snp_index = np.empty(max_pos, dtype=np.int32)
        self.snp_index[:] = SNP_UNDEF
        self.snp_index[self.snp_pos-1] = np.arange(self.snp_pos.shape[0])

        self.n_snp = self.snp_pos.shape[0]

        # currently haplotypes can only be read from HDF5 file
        self.haplotypes = None
Beispiel #13
0
def main():
    """Write haplotype read-count output for every test SNP in the input file.

    For each non-comment line of ``args.input_file`` (plain or gzipped) the
    chromosome is looked up by name (retrying with a "chr" prefix), the
    target regions and SNPs in them are fetched, and one output row is
    written to stdout. Lines whose second field is "NA" produce an NA row.
    Lines naming an unknown chromosome are skipped, with one warning per
    distinct name. A "." is written to stderr every 1000 input lines.
    """
    args = parse_args()
    write_header(sys.stdout)

    # find index of individual in list of samples
    ind_idx = lookup_individual_index(args, args.individual)

    data_files = DataFiles(args)

    try:
        chrom_list = chromosome.get_all_chromosomes(args.chrom)
        chrom_dict = chromosome.get_chromosome_dict(args.chrom)

        genomewide_read_counts = get_genomewide_count(data_files.read_count_h5,
                                                      chrom_list)

        unknown_chrom = set()

        if util.is_gzipped(args.input_file):
            f = gzip.open(args.input_file, "rt")
        else:
            f = open(args.input_file, "r")

        line_count = 0

        if args.target_region_size:
            sys.stderr.write("setting target region size to %d\n" %
                             args.target_region_size)

        with f:
            for line in f:
                line_count += 1
                if line_count % 1000 == 0:
                    sys.stderr.write(".")

                if line.startswith("#"):
                    continue

                words = line.rstrip().split()

                if words[1] == "NA":
                    # no SNP defined on this line:
                    write_NA_line(sys.stdout)
                    continue

                # Resolve the chromosome afresh for every line so that an
                # unknown name can never reuse the previous line's chromosome.
                chrom_name = words[0]
                chrom = None
                if chrom_name in chrom_dict:
                    chrom = chrom_dict[chrom_name]
                elif not chrom_name.startswith("chr"):
                    # try adding 'chr' to front of name
                    new_chrom_name = "chr" + chrom_name
                    if new_chrom_name in chrom_dict:
                        chrom = chrom_dict[new_chrom_name]

                if chrom is None:
                    # can't figure out this chromosome name; warn once per name
                    if chrom_name not in unknown_chrom:
                        unknown_chrom.add(chrom_name)
                        sys.stderr.write("WARNING: unknown chromosome '%s'\n"
                                         % chrom_name)
                    continue

                region_list = get_target_regions(args, chrom, words)

                snp_pos = int(words[1])
                snp_ref_base = words[3]
                snp_alt_base = words[4]
                # TODO: check that SNP ref/alt match?

                snp_region = coord.Coord(chrom, snp_pos, snp_pos)

                # pull out all of the SNPs in the target region(s)
                region_snps = get_region_snps(data_files, region_list, ind_idx)

                # pull out test SNP
                test_snp_list = get_region_snps(data_files, [snp_region], ind_idx)
                if len(test_snp_list) != 1:
                    test_snp = None
                    sys.stderr.write("WARNING: could not find test SNP at "
                                     "position %s:%d\n" % (chrom.name, snp_pos))
                    het_snps = []
                else:
                    test_snp = test_snp_list[0]

                    # pull out haplotype counts from linked heterozygous SNPs
                    het_snps = get_het_snps(region_snps)
                    set_snp_counts(data_files, region_list, het_snps, test_snp, args)

                region_read_counts = get_region_read_counts(data_files, region_list)

                write_output(sys.stdout, region_list, het_snps, test_snp, snp_pos,
                             region_read_counts, genomewide_read_counts)

        sys.stderr.write("\n")
    finally:
        # release the data files even when processing fails part-way
        data_files.close()
def main():
    """Write haplotype read-count output for every test SNP in the input file.

    Reads ``args.input_file`` (plain or gzipped) line by line, skipping
    lines that start with "#". A line whose second field is "NA" yields an
    NA output row. Otherwise the chromosome named in the first field is
    looked up (retrying with a "chr" prefix), target regions and their SNPs
    are fetched, and one row is written to stdout. Unknown chromosome names
    are skipped with a single warning per name. A "." is written to stderr
    every 1000 input lines as a progress indicator.
    """
    args = parse_args()

    write_header(sys.stdout)

    # find index of individual in list of samples
    ind_idx = lookup_individual_index(args, args.individual)

    data_files = DataFiles(args)

    try:
        chrom_list = chromosome.get_all_chromosomes(args.chrom)
        chrom_dict = chromosome.get_chromosome_dict(args.chrom)

        genomewide_read_counts = get_genomewide_count(data_files.read_count_h5,
                                                      chrom_list)

        unknown_chrom = set()

        if util.is_gzipped(args.input_file):
            f = gzip.open(args.input_file, "rt")
        else:
            f = open(args.input_file, "rt")

        try:
            line_count = 0

            if args.target_region_size:
                sys.stderr.write("setting target region size to %d\n" %
                                 args.target_region_size)

            for line in f:
                line_count += 1
                if line_count % 1000 == 0:
                    sys.stderr.write(".")

                if line.startswith("#"):
                    continue

                words = line.rstrip().split()

                if words[1] == "NA":
                    # no SNP defined on this line:
                    write_NA_line(sys.stdout)
                    continue

                chrom_name = words[0]
                if chrom_name not in chrom_dict \
                   and not chrom_name.startswith("chr"):
                    # try adding 'chr' to front of name
                    new_chrom_name = "chr" + chrom_name
                    if new_chrom_name in chrom_dict:
                        chrom_name = new_chrom_name

                if chrom_name not in chrom_dict:
                    # can't figure out this chromosome name. Skip the line
                    # rather than falling through with the chromosome left
                    # over from an earlier line; warn once per name.
                    if chrom_name not in unknown_chrom:
                        unknown_chrom.add(chrom_name)
                        sys.stderr.write("WARNING: unknown chromosome '%s'\n"
                                         % chrom_name)
                    continue

                chrom = chrom_dict[chrom_name]

                region_list = get_target_regions(args, chrom, words)

                snp_pos = int(words[1])
                snp_ref_base = words[3]
                snp_alt_base = words[4]
                # TODO: check that SNP ref/alt match?

                snp_region = coord.Coord(chrom, snp_pos, snp_pos)

                # pull out all of the SNPs in the target region(s)
                region_snps = get_region_snps(data_files, region_list, ind_idx)

                # pull out test SNP
                test_snp_list = get_region_snps(data_files, [snp_region], ind_idx)
                if len(test_snp_list) != 1:
                    test_snp = None
                    sys.stderr.write("WARNING: could not find test SNP at "
                                     "position %s:%d\n" % (chrom.name, snp_pos))
                    het_snps = []
                else:
                    test_snp = test_snp_list[0]

                    # pull out haplotype counts from linked heterozygous SNPs
                    het_snps = get_het_snps(region_snps)
                    set_snp_counts(data_files, region_list, het_snps, test_snp, args)

                region_read_counts = get_region_read_counts(data_files, region_list)

                write_output(sys.stdout, region_list, het_snps, test_snp, snp_pos,
                             region_read_counts, genomewide_read_counts)

            sys.stderr.write("\n")
        finally:
            f.close()
    finally:
        # release the data files even when processing fails part-way
        data_files.close()