Example #1
0
def preprocess_reads(args):
    """Preprocess fastq files: strip UMIs from the reads and append
    them to the read names.
    """

    config = None
    if args.config:
        config = configparser.ConfigParser()
        config.read(args.config)
        # Verify that every path listed under [PATHS] in the config exists.
        config_validation(conf_paths=dict(config.items('PATHS')))

    paths = config['PATHS'] if config else None
    prepfile = handle_arg(args.prepfile,
                          paths['prep_file'] if paths else None,
                          'ERR: No prepfile provided in args or config.')
    output_path = handle_arg(args.output_path,
                             paths['output_path'] if paths else None,
                             'ERR: No output path provided in args or config.')

    # Verify that the directories/files named on the command line exist.
    arg_exists(sys.argv)

    reheader_fastqs(r1_file=args.read1,
                    r2_file=args.read2,
                    r3_file=args.read3,
                    output_path=output_path,
                    prepname=args.prepname,
                    prepfile=prepfile)
Example #2
0
def collapse(args):
    """Base collapses from given BAM and umi family file.

    Args:
        args: Parsed CLI namespace; uses args.config, args.region,
            args.bam_file, args.output_path and args.umi_file.

    Raises:
        ValueError: If args.region is not shaped like chr1:1200000-1250000.
    """

    if args.config:
        config = configparser.ConfigParser()
        config.read(args.config)
        config_validation(conf_paths=dict(
            config.items('PATHS')))  ##Check whether PATHS in config file exist
    else:
        config = None

    region = args.region

    if any(item not in region for item in ["chr", ":", "-"]):
        # raise aborts the function; the unreachable sys.exit(1) that
        # followed it has been removed.
        raise ValueError(
            'ERR: Incorrect region string (should look like chr1:1200000-1250000).'
        )

    contig = region.split(":")[0]
    region_start = int(region.split(":")[1].split("-")[0])
    region_end = int(region.split(":")[1].split("-")[1])

    bam_file = handle_arg(args.bam_file,
                          config['PATHS']['bam_file'] if config else None,
                          'ERR: No BAM file provided in args or config.')
    output_path = handle_arg(
        args.output_path, config['PATHS']['output_path'] if config else None,
        'ERR: No output path provided in args or config.')

    arg_exists(sys.argv)  ##Check whether args directories/files exist

    # Default to None so umi_file is always bound; previously it was a
    # NameError when neither args.umi_file nor a config was provided.
    umi_file = None
    if args.umi_file:
        umi_file = args.umi_file
    elif config and 'umi_file' in config['PATHS']:
        umi_file = config['PATHS']['umi_file']

    if umi_file:
        try:
            # Context manager closes the handle even if unpickling fails.
            with open(umi_file, "rb") as umi_fh:
                umi_table = pickle.load(umi_fh)
        except IOError:
            print("ERR: Unable to load .umis file.", file=sys.stderr)
            sys.exit(1)
    else:
        umi_table = None

    print(timestamp() + "Generating consensus...")

    generate_consensus_output(contig=contig,
                              region_start=region_start,
                              region_end=region_end,
                              bam_file=bam_file,
                              umi_table=umi_table,
                              output_path=output_path,
                              config=config)

    print(timestamp() +
          "Consensus generated. Consensus file written to {}.".format(
              output_path))
Example #3
0
def preprocess_reads(args):
    """Preprocess fastq files: strip UMIs from the reads and append
    them to the read names.
    """

    config = None
    if args.config:
        config = configparser.ConfigParser()
        config.read(args.config)

    paths = config['PATHS'] if config else None
    prepfile = handle_arg(args.prepfile,
                          paths['prep_file'] if paths else None,
                          'ERR: No prepfile provided in args or config.')
    output_path = handle_arg(args.output_path,
                             paths['output_path'] if paths else None,
                             'ERR: No output path provided in args or config.')

    reheader_fastqs(r1_file=args.read1,
                    r2_file=args.read2,
                    r3_file=args.read3,
                    output_path=output_path,
                    prepname=args.prepname,
                    prepfile=prepfile)
Example #4
0
def group_umis(args):
    """Groups and error-corrects UMIs into families.

    Pickles the resulting family table to <output_path>/<region>.umis.

    Raises:
        ValueError: If args.region is not shaped like chr1:1200000-1250000.
    """

    if args.config:
        config = configparser.ConfigParser()
        config.read(args.config)
    else:
        config = None

    region = args.region
    if any(item not in region for item in [":", "-"]):
        # raise aborts the function; the unreachable sys.exit(1) that
        # followed it has been removed.
        raise ValueError(
            'ERR: Incorrect region string (should look like chr1:1200000-1250000).'
        )

    contig = region.split(":")[0]
    region_start = int(region.split(":")[1].split("-")[0])
    region_end = int(region.split(":")[1].split("-")[1])

    bam_file = handle_arg(args.bam_file,
                          config['PATHS']['bam_file'] if config else None,
                          'ERR: No BAM file provided in args or config.')
    output_path = handle_arg(
        args.output_path, config['PATHS']['output_path'] if config else None,
        'ERR: No output path provided in args or config.')

    print(timestamp() + "Grouping UMIs...")

    ## Generate an error-corrected list of UMI families
    umi_families = get_umi_families(contig=contig,
                                    region_start=region_start,
                                    region_end=region_end,
                                    bam_file=bam_file,
                                    config=config)

    umi_file = "{}/{}.umis".format(output_path, region)
    # Context manager ensures the handle is flushed and closed; the
    # previous pickle.dump(..., open(...)) leaked it.
    with open(umi_file, "wb") as umi_fh:
        pickle.dump(umi_families, umi_fh)

    print(timestamp() +
          "UMI grouping complete. Output written to {}.".format(output_path))
Example #5
0
def call_variants(args):
    """Generates VCF files from given cons file.

    Raises:
        ValueError: If args.region is not shaped like chr1:1200000-1250000
            (only enforced for non-merged cons files).
    """

    if args.config:
        config = configparser.ConfigParser()
        config.read(args.config)
        config_validation(conf_paths=dict(
            config.items('PATHS')))  ##Check whether PATHS in config file exist
    else:
        config = None

    cons_file = args.cons_file
    # NOTE(review): f_sizes is parsed but never used in this function —
    # confirm whether it should be passed to get_vcf_output.
    f_sizes = args.f_sizes.split(',')

    output_path = handle_arg(
        args.output_path, config['PATHS']['output_path'] if config else None,
        'No output path provided in args or config.')

    region = args.region
    arg_exists(sys.argv)  ##Check whether args directories/files exist

    cons_is_merged = check_consfile(cons_file)

    if cons_is_merged:
        # Merged cons files encode the bounds as "<start>_<end>" strings.
        region_start = region.split("_")[0]
        region_end = region.split("_")[1]

    else:
        if any(x not in region for x in ["chr", ":", "-"]):
            # raise aborts the function; the unreachable sys.exit(1) that
            # followed it has been removed.
            raise ValueError(
                'Incorrect region string (should look like chr1:1200000-1250000).'
            )

        contig = region.split(":")[0]
        region_start = int(region.split(":")[1].split("-")[0])
        region_end = int(region.split(":")[1].split("-")[1])

    print(timestamp() + "Generating VCFs...")

    # NOTE(review): run_id is not defined in this function — presumably a
    # module-level value; confirm against the full file.
    get_vcf_output(cons_file=cons_file,
                   region_start=region_start,
                   region_end=region_end,
                   output_path=output_path,
                   config=config,
                   run_id=run_id)

    print(timestamp() +
          "VCFs generated. VCF files written to {}.".format(output_path))
Example #6
0
def call_variants(args):
    """Generates VCF files from given cons file.

    Raises:
        ValueError: If args.region is not shaped like chr1:1200000-1250000.
    """

    if args.config:
        config = configparser.ConfigParser()
        config.read(args.config)
    else:
        config = None

    cons_file = args.cons_file

    f_sizes = args.f_sizes.split(',')

    region = args.region
    if any(x not in region for x in [":", "-"]):
        # raise aborts the function; the unreachable sys.exit(1) that
        # followed it has been removed.
        raise ValueError(
            'Incorrect region string (should look like chr1:1200000-1250000).')

    contig = region.split(":")[0]
    region_start = int(region.split(":")[1].split("-")[0])
    region_end = int(region.split(":")[1].split("-")[1])

    output_path = handle_arg(
        args.output_path, config['PATHS']['output_path'] if config else None,
        'No output path provided in args or config.')

    print(timestamp() + "Generating VCFs...")

    generate_vcf_output(cons_file=cons_file,
                        f_sizes=f_sizes,
                        contig=contig,
                        region_start=region_start,
                        region_end=region_end,
                        output_path=output_path,
                        config=config)

    print(timestamp() +
          "VCFs generated. VCF files written to {}.".format(output_path))
    # NOTE(review): this is the tail of a CLI entry point; the parser setup
    # and enclosing def are not visible in this chunk.
    parser.add_argument('-o', '--output_path', help='Path to write output files to.')
    parser.add_argument('-c', '--config', help='Path to your config file.')
    parser.add_argument('-t', '--tally', help='Path to your tally (output of UMI_count.py).')

    args = parser.parse_args()

    if args.config:
        config = configparser.ConfigParser()
        # NOTE(review): `config_file` is not defined anywhere visible here —
        # this likely should be `args.config`; confirm against the full file.
        config.read(config_file)
    else:
        config = None
    
    region = args.region
    if any(x not in region for x in ["chr", ":", "-"]):
        # NOTE(review): the sys.exit(1) below is unreachable — raise aborts first.
        raise ValueError('Incorrect region string (should look like chr1:1200000-1250000).')
        sys.exit(1)

    # Split "chrN:start-end" into its three components.
    contig = region.split(":")[0]
    region_start = int(region.split(":")[1].split("-")[0])
    region_end = int(region.split(":")[1].split("-")[1])

    bam_file = handle_arg(args.bam_file, config['PATHS']['bam_file'] if config else None, 
                    'No BAM file provided in args or config.')
    output_path = handle_arg(args.output_path, config['PATHS']['output_path'] if config else None, 
                    'No output path provided in args or config.')
    # NOTE(review): the tally fallback reuses output_path + region rather than a
    # config entry — presumably intentional, but verify.
    tally_file = handle_arg(args.tally, output_path + '/' + region + '.tally' if config else None, 
                    'No tally file provided.')

    ## Output
    generate_consensus_output(contig, region_start, region_end, bam_file, tally_file, output_path, config)
Example #8
0
def group_umis(args):
    """Groups and error-corrects UMIs into families.

    Writes a per-region summary CSV (datafile_<region>.csv) and pickles the
    UMI family table to <output_path>/<region>.umis.

    Raises:
        ValueError: If args.region is not shaped like chr1:1200000-1250000.
    """

    if args.config:
        config = configparser.ConfigParser()
        config.read(args.config)
        config_validation(conf_paths=dict(
            config.items('PATHS')))  ##Check whether PATHS in config file exist
    else:
        config = None

    region = args.region
    if any(item not in region for item in ["chr", ":", "-"]):
        # raise aborts the function; the unreachable sys.exit(1) that
        # followed it has been removed.
        raise ValueError(
            'ERR: Incorrect region string (should look like chr1:1200000-1250000).'
        )

    contig = region.split(":")[0]
    region_start = int(region.split(":")[1].split("-")[0])
    region_end = int(region.split(":")[1].split("-")[1])

    bam_file = handle_arg(args.bam_file,
                          config['PATHS']['bam_file'] if config else None,
                          'ERR: No BAM file provided in args or config.')
    output_path = handle_arg(
        args.output_path, config['PATHS']['output_path'] if config else None,
        'ERR: No output path provided in args or config.')

    arg_exists(sys.argv)  ##Check whether args directories/files exist

    print(timestamp() + "Grouping UMIs...")

    ## Generate an error-corrected list of UMI families
    umi_families, umi_groups = get_umi_families(contig=contig,
                                                region_start=region_start,
                                                region_end=region_end,
                                                bam_file=bam_file,
                                                config=config)

    total_parent_umi_count, total_child_umi_count, num_of_children, freq_of_parent_umis = umi_datafile(
        umi_groups)

    filename = "{}/datafile_{}.csv".format(output_path, region)
    headers = [
        'CHR', 'START', 'END', 'PTU', 'CTU', 'CHILD_NUMS', 'FREQ_PARENTS'
    ]
    csv.register_dialect('myDialect', delimiter='\t', quoting=csv.QUOTE_NONE)
    csvrow = {
        'CHR': contig,
        'START': str(region_start),
        'END': str(region_end),
        'PTU': str(total_parent_umi_count),
        'CTU': str(total_child_umi_count),
        'CHILD_NUMS': num_of_children,
        'FREQ_PARENTS': freq_of_parent_umis
    }

    # Context manager ensures the CSV is flushed and closed; the previous
    # code opened it (shadowing the `file` builtin) and never closed it.
    # The unused `info` list was also removed.
    with open(filename, "w") as datafile:
        writer = csv.DictWriter(datafile,
                                dialect='myDialect',
                                fieldnames=headers)
        writer.writeheader()
        writer.writerow(csvrow)

    umi_file = "{}/{}.umis".format(output_path, region)
    with open(umi_file, "wb") as umi_fh:
        pickle.dump(umi_families, umi_fh)

    print(timestamp() +
          "UMI grouping complete. Output written to {}.".format(output_path))