Example #1
0
def main():
    """Command-line entry point: pool SNPs from several files and report
    coldspots.

    Parses the command line, treats every positional argument as a SNP
    file, pools the SNPs parsed from all of them, prints a three-line
    '#'-prefixed header to stdout and passes the pooled SNPs, the
    sequence length and any excluded positions to
    find_coldspot_regions().

    Terminates with a usage error when fewer than two SNP files are
    given, and with exit status 1 when the sequence length option is
    missing.
    """

    parser = cmdline_parser()
    (opts, args) = parser.parse_args()

    snp_files = args
    if len(snp_files) < 2:
        # The guard accepts exactly two files, so the message must say
        # "at least two" (it used to claim "more than two").
        parser.error("Need at least two SNP files.")
        sys.exit(1)

    if opts.verbose:
        LOG.setLevel(logging.INFO)
    if opts.debug:
        LOG.setLevel(logging.DEBUG)

    # Options that are mandatory despite being passed as options.
    for (required_opt, opt_descr) in [
            (opts.seq_len, "sequence length"),
            ]:
        if not required_opt:
            LOG.fatal("Missing %s argument", opt_descr)
            sys.exit(1)

    LOG.info("Init sig-level=%f", SIG_LEVEL)

    excl_pos = []
    if opts.fexclude:
        excl_pos = read_exclude_pos_file(opts.fexclude)
        LOG.info("Excluding %d positions", len(excl_pos))

    snps = []
    for snp_file in snp_files:
        more_snps = snp.parse_snp_file(snp_file)
        snps.extend(more_snps)
        LOG.info("Parsed %d SNPs from %s", len(more_snps), snp_file)

    # Single-argument parenthesised print produces the same output under
    # the Python 2 print statement and the Python 3 print function.
    print("#coldspots")
    print("#excluding positions listed in %s" % opts.fexclude)
    print("#considering snvs listed in %s" % (', '.join(snp_files)))
    find_coldspot_regions(snps, opts.seq_len, excl_pos)
Example #2
0
def main():
    """Command-line entry point for the hotspot window scan.

    Rejects any positional arguments, checks that window size, step
    size, SNP file and sequence length were all supplied, optionally
    loads the positions to exclude, parses the SNP file and passes
    everything to find_hotspot_windows().

    Exits with status 1 (after logging) when a required option is
    missing, and with a usage error when positional arguments are given.
    """

    parser = cmdline_parser()
    opts, args = parser.parse_args()

    # This tool takes options only; anything positional is a mistake.
    if args:
        leftovers = ' '.join(args)
        parser.error("Unrecognized arguments found: %s." % leftovers)
        sys.exit(1)

    if opts.verbose:
        LOG.setLevel(logging.INFO)
    if opts.debug:
        LOG.setLevel(logging.DEBUG)

    # (description, value) pairs, checked in this order.
    mandatory = (
        ("window size", opts.win_size),
        ("step size", opts.step_size),
        ("SNP file", opts.snp_file),
        ("sequence length", opts.seq_len),
    )
    for descr, value in mandatory:
        if not value:
            LOG.fatal("Missing %s argument", descr)
            sys.exit(1)

    if opts.fexclude:
        excl_pos = read_exclude_pos_file(opts.fexclude)
        LOG.info("Excluding %d positions", len(excl_pos))
    else:
        excl_pos = []

    snps = snp.parse_snp_file(opts.snp_file)
    LOG.info("Parsed %d SNPs from %s", len(snps), opts.snp_file)

    find_hotspot_windows(
        snps, opts.seq_len, opts.win_size, opts.step_size, excl_pos)
Example #3
0
def main():
    """Cluster variants whose frequency confidence intervals overlap.

    Reads variants from args.var_file, sorts them by descending
    frequency and groups them greedily: the most frequent unassigned
    variant seeds a cluster, and each following variant joins that
    cluster while its max_ci is above the seed's min_ci; otherwise it
    seeds the next cluster. One line per cluster is written to
    args.cluster_file ('-' selects stdout), followed by a summary line
    on stdout.

    Exits with a non-zero status when a file argument is missing, the
    input does not exist, the output already exists, or the input could
    not be parsed; exits with status 0 after reporting an empty input.
    """

    parser = cmdline_parser()
    args = parser.parse_args()

    # FIXME catch unrecognized args (not just (len(args)

    if args.verbose:
        LOG.setLevel(logging.INFO)
    if args.debug:
        LOG.setLevel(logging.DEBUG)

    for (in_file, descr) in [(args.var_file, "variant file")]:
        if not in_file:
            parser.error("%s input file argument missing." % descr)
            sys.exit(1)
        if not os.path.exists(in_file) and in_file != "-":
            sys.stderr.write(
                "file '%s' does not exist.\n" % in_file)
            sys.exit(1)

    for (out_file, descr) in [(args.cluster_file, "cluster output file")]:
        if not out_file:
            parser.error("%s output file argument missing." % descr)
            sys.exit(1)
        if os.path.exists(out_file) and out_file != "-":
            sys.stderr.write(
                "Cowardly refusing to overwrite existing"
                " output file '%s'.\n" % out_file)
            sys.exit(1)

    # A lot of code for supporting legacy SNP format.
    #
    # FIXME this and MetaVar() should use vcf by default and just
    # convert snp to vcf
    #
    is_vcf = False
    if HAVE_VCF_MODULE:
        if args.var_file == '-':
            vcf_fh = sys.stdin
        else:
            vcf_fh = open(args.var_file)
            # FIXME gzip support
        # Parse errors still propagate to the caller (as before), but the
        # input handle is now released on the way out instead of leaking.
        try:
            vcf_reader = vcf.VCFReader(vcf_fh)
            var_list = [MetaVar(vcf_var=r) for r in vcf_reader]
            is_vcf = True
        finally:
            if vcf_fh is not sys.stdin:
                vcf_fh.close()

    is_snp = False
    if not is_vcf and HAVE_SNP_MODULE:
        try:
            var_list = [MetaVar(snp_var=s)
                        for s in snp.parse_snp_file(args.var_file)]
            is_snp = True
        except IndexError:
            # Treated as "not in SNP format"; reported just below.
            pass

    if not is_snp and not is_vcf:
        LOG.error("Can't parse %s. Tried the following formats: %s",
                  args.var_file, ', '.join(SUPPORTED_FORMATS))
        sys.exit(1)

    LOG.info("Parsed %d SNPs from %s", len(var_list), args.var_file)

    # Highest frequency first: each cluster is seeded by its most
    # frequent member.
    var_list = sorted(var_list, key=lambda x: x.freq, reverse=True)

    if args.cluster_file == '-':
        fh_out = sys.stdout
    else:
        fh_out = open(args.cluster_file, 'w')

    # The finally clause closes the output file on every exit path,
    # including sys.exit() below and any error while clustering/writing.
    try:
        if not var_list:
            fh_out.write("No SNPs <-> no clusters!\n")
            if fh_out is not sys.stdout:
                print("No SNPs <-> no clusters!")
            sys.exit(0)

        # Each cluster records its members plus the seed's CI bounds.
        clusters = []
        for var in var_list:
            if clusters:
                current = clusters[-1]
                LOG.debug("checking %s %f: max_ci %f vvar. clu_min %f",
                          var.repr, var.freq, var.max_ci, current['min'])
                if var.max_ci > current['min']:
                    current['members'].append("%s" % (var.repr))
                    continue
            # First variant, or no overlap with the current cluster:
            # this variant seeds a new one.
            clusters.append({
                'members': ["%s" % (var.repr)],
                'min': var.min_ci,
                'max': var.max_ci,
            })

        for (idx, clu) in enumerate(clusters):
            fh_out.write("cluster %d (freq. range: %f - %f): %s\n" % (
                idx + 1, clu['min'], clu['max'],
                ', '.join(clu['members'])))
    finally:
        if fh_out is not sys.stdout:
            fh_out.close()

    # Single-argument parenthesised print produces the same output under
    # the Python 2 print statement and the Python 3 print function.
    print("%d clusters found (written to %s)" % (len(clusters), fh_out.name))
Example #4
0
def main():
    """Cluster variants whose frequency confidence intervals overlap.

    Reads variants from args.var_file, sorts them by descending
    frequency and groups them greedily: the most frequent unassigned
    variant seeds a cluster, and each following variant joins that
    cluster while its max_ci is above the seed's min_ci; otherwise it
    seeds the next cluster. One line per cluster is written to
    args.cluster_file ('-' selects stdout), followed by a summary line
    on stdout.

    Exits with a non-zero status when a file argument is missing, the
    input does not exist, the output already exists, or the input could
    not be parsed; exits with status 0 after reporting an empty input.
    """

    parser = cmdline_parser()
    args = parser.parse_args()

    # FIXME catch unrecognized args (not just (len(args)

    if args.verbose:
        LOG.setLevel(logging.INFO)
    if args.debug:
        LOG.setLevel(logging.DEBUG)

    for (in_file, descr) in [(args.var_file, "variant file")]:
        if not in_file:
            parser.error("%s input file argument missing." % descr)
            sys.exit(1)
        if not os.path.exists(in_file) and in_file != "-":
            sys.stderr.write("file '%s' does not exist.\n" % in_file)
            sys.exit(1)

    for (out_file, descr) in [(args.cluster_file, "cluster output file")]:
        if not out_file:
            parser.error("%s output file argument missing." % descr)
            sys.exit(1)
        if os.path.exists(out_file) and out_file != "-":
            sys.stderr.write("Cowardly refusing to overwrite existing"
                             " output file '%s'.\n" % out_file)
            sys.exit(1)

    # A lot of code for supporting legacy SNP format.
    #
    # FIXME this and MetaVar() should use vcf by default and just
    # convert snp to vcf
    #
    is_vcf = False
    if HAVE_VCF_MODULE:
        if args.var_file == '-':
            vcf_fh = sys.stdin
        else:
            vcf_fh = open(args.var_file)
            # FIXME gzip support
        # Parse errors still propagate to the caller (as before), but the
        # input handle is now released on the way out instead of leaking.
        try:
            vcf_reader = vcf.VCFReader(vcf_fh)
            var_list = [MetaVar(vcf_var=r) for r in vcf_reader]
            is_vcf = True
        finally:
            if vcf_fh is not sys.stdin:
                vcf_fh.close()

    is_snp = False
    if not is_vcf and HAVE_SNP_MODULE:
        try:
            var_list = [
                MetaVar(snp_var=s) for s in snp.parse_snp_file(args.var_file)
            ]
            is_snp = True
        except IndexError:
            # Treated as "not in SNP format"; reported just below.
            pass

    if not is_snp and not is_vcf:
        LOG.error("Can't parse %s. Tried the following formats: %s",
                  args.var_file, ', '.join(SUPPORTED_FORMATS))
        sys.exit(1)

    LOG.info("Parsed %d SNPs from %s", len(var_list), args.var_file)

    # Highest frequency first: each cluster is seeded by its most
    # frequent member.
    var_list = sorted(var_list, key=lambda x: x.freq, reverse=True)

    if args.cluster_file == '-':
        fh_out = sys.stdout
    else:
        fh_out = open(args.cluster_file, 'w')

    # The finally clause closes the output file on every exit path,
    # including sys.exit() below and any error while clustering/writing.
    try:
        if not var_list:
            fh_out.write("No SNPs <-> no clusters!\n")
            if fh_out is not sys.stdout:
                print("No SNPs <-> no clusters!")
            sys.exit(0)

        # Each cluster records its members plus the seed's CI bounds.
        clusters = []
        for var in var_list:
            if clusters:
                current = clusters[-1]
                LOG.debug("checking %s %f: max_ci %f vvar. clu_min %f",
                          var.repr, var.freq, var.max_ci, current['min'])
                if var.max_ci > current['min']:
                    current['members'].append("%s" % (var.repr))
                    continue
            # First variant, or no overlap with the current cluster:
            # this variant seeds a new one.
            clusters.append({
                'members': ["%s" % (var.repr)],
                'min': var.min_ci,
                'max': var.max_ci,
            })

        for (idx, clu) in enumerate(clusters):
            fh_out.write("cluster %d (freq. range: %f - %f): %s\n" %
                         (idx + 1, clu['min'], clu['max'],
                          ', '.join(clu['members'])))
    finally:
        if fh_out is not sys.stdout:
            fh_out.close()

    # Single-argument parenthesised print produces the same output under
    # the Python 2 print statement and the Python 3 print function.
    print("%d clusters found (written to %s)" % (len(clusters), fh_out.name))