Esempio n. 1
0
def metagene_bin():
    '''Main program for metagene_bin.py

    For every input file listed in ``arguments.input``:

    1. The first line of the input (kept verbatim because the plotting step
       needs it) and a CSV column header are written to each output file
       returned by ``build_output_filenames``.
    2. The second input line is split on commas; fields from the third
       onwards are taken as positions relative to the gene start.
    3. Each remaining line is split on commas into a gene name, an
       ``orientation:gap`` key and per-position counts.  A window of
       ``arguments.window_size`` positions is moved along the counts in
       increments of ``arguments.step_size``; the summed counts of every
       complete window are appended to the output file selected by the
       ``orientation:gap`` key.

    Raises:
        ValueError: if ``window_size`` or ``step_size`` is smaller than 1.
    '''

    arguments = get_arguments()
    window_size = arguments.window_size
    step_size = arguments.step_size

    # A step of 0 would never move the window (endless loop) and a
    # non-positive window has no meaningful sum, so reject both up front.
    if window_size < 1 or step_size < 1:
        raise ValueError(
            "window_size and step_size must be positive integers "
            "(got window_size={!r}, step_size={!r})".format(window_size,
                                                            step_size))

    for infile in arguments.input:
        # single-argument print(...) behaves the same on Python 2 and 3
        print("Processing file:\t{}".format(infile))

        # returns a dict of file names with keys of orientation:gap_counting
        output_files = build_output_filenames(infile, arguments.output_prefix,
                                              window_size, step_size,
                                              arguments.separate_groups)

        with open(infile, 'r') as inf:
            metagene = inf.readline()
            for output in output_files.values():
                # 'w' truncates, so every run starts from a clean file
                with open(output, 'w') as outf:
                    outf.write(metagene)  # needed for plotting
                    outf.write(
                        "Gene,Orientation,Gapped,Window,Inclusive_Start,Inclusive_End,Abundance\n"
                    )

            header = inf.readline().strip().split(",")
            positions = header[2:]  # positions relative to gene start

            for counts_line in read_chunk(inf, 1024):
                stripped = counts_line.strip()
                if not stripped:
                    # blank line (e.g. trailing newline): nothing to bin
                    continue

                counts_parts = stripped.split(",")
                group_key = counts_parts[1]  # "orientation:gap"
                counts = counts_parts[2:]
                (orientation, gap) = group_key.split(":")
                row_prefix = "{},{},{}".format(counts_parts[0], orientation,
                                               gap)

                # Collect all rows for this line first so the output file is
                # opened once per line instead of once per window.
                rows = []
                window_ends = range(window_size, len(counts) + 1, step_size)
                for window, exclusive_end in enumerate(window_ends):
                    inclusive_start = exclusive_end - window_size
                    # start value 0.0 keeps the result a float
                    coverage = sum(
                        (float(value)
                         for value in counts[inclusive_start:exclusive_end]),
                        0.0)
                    rows.append("{},{},{},{},{}\n".format(
                        row_prefix, window, positions[inclusive_start],
                        positions[exclusive_end - 1], coverage))

                if rows:
                    with open(output_files[group_key], 'a') as outf:
                        outf.writelines(rows)
def metagene_bin():
    '''Main program for metagene_bin.py

    For every input file listed in ``arguments.input``:

    1. The first line of the input (kept verbatim because the plotting step
       needs it) and a CSV column header are written to each output file
       returned by ``build_output_filenames``.
    2. The second input line is split on commas; fields from the third
       onwards are taken as positions relative to the gene start.
    3. Each remaining line is split on commas into a gene name, an
       ``orientation:gap`` key and per-position counts.  A window of
       ``arguments.window_size`` positions is moved along the counts in
       increments of ``arguments.step_size``; the summed counts of every
       complete window are appended to the output file selected by the
       ``orientation:gap`` key.

    Raises:
        ValueError: if ``window_size`` or ``step_size`` is smaller than 1.
    '''

    arguments = get_arguments()
    window_size = arguments.window_size
    step_size = arguments.step_size

    # A step of 0 would never move the window (endless loop) and a
    # non-positive window has no meaningful sum, so reject both up front.
    if window_size < 1 or step_size < 1:
        raise ValueError(
            "window_size and step_size must be positive integers "
            "(got window_size={!r}, step_size={!r})".format(window_size, step_size))

    for infile in arguments.input:
        # single-argument print(...) behaves the same on Python 2 and 3
        print("Processing file:\t{}".format(infile))

        # returns a dict of file names with keys of orientation:gap_counting
        output_files = build_output_filenames(infile, arguments.output_prefix, window_size,
                                              step_size, arguments.separate_groups)

        with open(infile, 'r') as inf:
            metagene = inf.readline()
            for output in output_files.values():
                # 'w' truncates, so every run starts from a clean file
                with open(output, 'w') as outf:
                    outf.write(metagene)  # needed for plotting
                    outf.write("Gene,Orientation,Gapped,Window,Inclusive_Start,Inclusive_End,Abundance\n")

            header = inf.readline().strip().split(",")
            positions = header[2:]  # positions relative to gene start

            for counts_line in read_chunk(inf, 1024):
                stripped = counts_line.strip()
                if not stripped:
                    # blank line (e.g. trailing newline): nothing to bin
                    continue

                counts_parts = stripped.split(",")
                group_key = counts_parts[1]  # "orientation:gap"
                counts = counts_parts[2:]
                (orientation, gap) = group_key.split(":")
                row_prefix = "{},{},{}".format(counts_parts[0], orientation, gap)

                # Collect all rows for this line first so the output file is
                # opened once per line instead of once per window.
                rows = []
                window_ends = range(window_size, len(counts) + 1, step_size)
                for window, exclusive_end in enumerate(window_ends):
                    inclusive_start = exclusive_end - window_size
                    # start value 0.0 keeps the result a float
                    coverage = sum((float(value) for value in counts[inclusive_start:exclusive_end]), 0.0)
                    rows.append("{},{},{},{},{}\n".format(row_prefix, window, positions[inclusive_start],
                                                          positions[exclusive_end - 1], coverage))

                if rows:
                    with open(output_files[group_key], 'a') as outf:
                        outf.writelines(rows)
def metagene_count():
    """Chain of command for metagene_count analysis.

    Steps, in order:

    1. Parse arguments and initialise the ``Read`` / ``Feature`` class-level
       settings from the alignment file and the chromosome name list.
    2. Build the ``Metagene`` template and detect the feature file format.
    3. Start a fresh ``<output_prefix>.metagene_counts.csv``; the metagene
       definition and full header are written unless
       ``arguments.interval_variable`` is set.
    4. For every non-comment, non-blank line of the feature file, create a
       feature, fetch the overlapping alignments with ``samtools view``,
       count each read that ``Read.create_from_sam`` accepts, and append the
       feature's metagene row to the output file.

    Raises:
        MetageneError: if the metagene template or feature format cannot be
            set up, or if ``run_pipe`` reports failure for a feature's region.
    """
    arguments = get_arguments()
    # confirm BAM file and extract chromosome sizes
    Read.set_chromosome_sizes(arguments.alignment)
    # TODO: create a list of chromosomes to analyze and/or exclude
    # create chromosome conversion dictionary for feature (GFF/BED) to alignment (BAM)
    Feature.set_chromosome_conversion(arguments.chromosome_names, Read.chromosome_sizes.keys())

    # define has_abundance and has_mappings tags for Read class
    # (raw strings: "\d" is not a valid escape in a normal string literal)
    Read.set_sam_tag(arguments.extract_abundance, arguments.alignment, r"NA:i:(\d+)")
    Read.set_sam_tag(arguments.extract_mappings, arguments.alignment, r"NH:i:(\d+)")

    # define the metagene array shape (left padding, start, internal, end, right padding)
    # metagene = padding ---- internal region ---- padding
    try:
        metagene = Metagene(arguments.interval_size, arguments.padding, arguments.padding)
        # single-argument print(...) behaves the same on Python 2 and 3
        print("Metagene definition:\t{}".format(metagene))
    except MetageneError as err:
        print(err)
        raise MetageneError("Unable to create the metagene template")

    try:
        Feature.set_format(arguments.feature)  # assign file format for the feature file
        print("Reading feature file as {} format".format(Feature.format))
    except MetageneError as err:
        print(err)
        raise MetageneError("Unable to create the feature object")

    output_path = "{}.metagene_counts.csv".format(arguments.output_prefix)

    # Always open with 'w' first so rows left over from an earlier run with
    # the same prefix are discarded; the header line is only written when the
    # interval is not variable.
    with open(output_path, 'w') as output_file:
        if not arguments.interval_variable:
            output_file.write("# Metagene:\t{}\n".format(metagene))  # define for plotting later
            output_file.write(metagene.print_full())

    # for each feature
    with open(arguments.feature, 'r') as feature_file:
        for feature_line in read_chunk(feature_file, 1024):
            # skip blank lines and comment lines
            if not feature_line.strip() or feature_line.startswith("#"):
                continue

            # change creation with feature_method
            feature = Feature.create(arguments.feature_count, metagene, feature_line, arguments.count_splicing,
                                     arguments.ignore_strand)

            # pull out sam file lines; it is important to use Feature.get_samtools_region(chromosome_lengths) rather
            # than Feature.get_chromosome_region() because only the first ensures that the interval does not
            # extend beyond the length of the chromosome which makes samtools view return no reads
            (run_pipe_worked, sam_sample) = run_pipe(['samtools view {} {}'.format(
                arguments.alignment,
                feature.get_samtools_region())])

            if not run_pipe_worked:
                raise MetageneError("Could not pull chromosomal region {} for feature {} from BAM file {}.".format(
                    feature.get_chromosome_region(),
                    feature.name,
                    arguments.alignment))

            for samline in sam_sample:
                if not samline:
                    continue

                # create Read feature
                (created_read, read) = Read.create_from_sam(samline,
                                                            Feature.chromosome_conversion.values(),
                                                            arguments.count_method,
                                                            arguments.uniquely_mapping,
                                                            arguments.ignore_strand,
                                                            arguments.count_secondary_alignments,
                                                            arguments.count_failed_quality_control,
                                                            arguments.count_PCR_optical_duplicate,
                                                            arguments.count_supplementary_alignment)

                # count read (if it exists)
                if created_read:
                    feature.count_read(read, arguments.count_method, arguments.count_splicing,
                                       arguments.count_partial_reads, arguments.ignore_strand)

            # output the resulting metagene; appended per feature so finished
            # features are already on disk if a later one fails
            with open(output_path, 'a') as output_file:
                output_file.write(
                    "{}\n".format(feature.print_metagene(interval_override=arguments.interval_variable)))
Esempio n. 4
0
def metagene_count():
    """Chain of command for metagene_count analysis.

    Steps, in order:

    1. Parse arguments and initialise the ``Read`` / ``Feature`` class-level
       settings from the alignment file and the chromosome name list.
    2. Build the ``Metagene`` template and detect the feature file format.
    3. Start a fresh ``<output_prefix>.metagene_counts.csv``; the metagene
       definition and full header are written unless
       ``arguments.interval_variable`` is set.
    4. For every non-comment, non-blank line of the feature file, create a
       feature, fetch the overlapping alignments with ``samtools view``,
       count each read that ``Read.create_from_sam`` accepts, and append the
       feature's metagene row to the output file.

    Raises:
        MetageneError: if the metagene template or feature format cannot be
            set up, or if ``run_pipe`` reports failure for a feature's region.
    """
    arguments = get_arguments()
    # confirm BAM file and extract chromosome sizes
    Read.set_chromosome_sizes(arguments.alignment)
    # TODO: create a list of chromosomes to analyze and/or exclude
    # create chromosome conversion dictionary for feature (GFF/BED) to alignment (BAM)
    Feature.set_chromosome_conversion(arguments.chromosome_names,
                                      Read.chromosome_sizes.keys())

    # define has_abundance and has_mappings tags for Read class
    # (raw strings: "\d" is not a valid escape in a normal string literal)
    Read.set_sam_tag(arguments.extract_abundance, arguments.alignment,
                     r"NA:i:(\d+)")
    Read.set_sam_tag(arguments.extract_mappings, arguments.alignment,
                     r"NH:i:(\d+)")

    # define the metagene array shape (left padding, start, internal, end, right padding)
    # metagene = padding ---- internal region ---- padding
    try:
        metagene = Metagene(arguments.interval_size, arguments.padding,
                            arguments.padding)
        # single-argument print(...) behaves the same on Python 2 and 3
        print("Metagene definition:\t{}".format(metagene))
    except MetageneError as err:
        print(err)
        raise MetageneError("Unable to create the metagene template")

    try:
        # assign file format for the feature file
        Feature.set_format(arguments.feature)
        print("Reading feature file as {} format".format(Feature.format))
    except MetageneError as err:
        print(err)
        raise MetageneError("Unable to create the feature object")

    output_path = "{}.metagene_counts.csv".format(arguments.output_prefix)

    # Always open with 'w' first so rows left over from an earlier run with
    # the same prefix are discarded; the header line is only written when the
    # interval is not variable.
    with open(output_path, 'w') as output_file:
        if not arguments.interval_variable:
            # define for plotting later
            output_file.write("# Metagene:\t{}\n".format(metagene))
            output_file.write(metagene.print_full())

    # for each feature
    with open(arguments.feature, 'r') as feature_file:
        for feature_line in read_chunk(feature_file, 1024):
            # skip blank lines and comment lines
            if not feature_line.strip() or feature_line.startswith("#"):
                continue

            # change creation with feature_method
            feature = Feature.create(arguments.feature_count, metagene,
                                     feature_line,
                                     arguments.count_splicing,
                                     arguments.ignore_strand)

            # pull out sam file lines; it is important to use Feature.get_samtools_region(chromosome_lengths) rather
            # than Feature.get_chromosome_region() because only the first ensures that the interval does not
            # extend beyond the length of the chromosome which makes samtools view return no reads
            (run_pipe_worked, sam_sample) = run_pipe([
                'samtools view {} {}'.format(arguments.alignment,
                                             feature.get_samtools_region())
            ])

            if not run_pipe_worked:
                raise MetageneError(
                    "Could not pull chromosomal region {} for feature {} from BAM file {}."
                    .format(feature.get_chromosome_region(), feature.name,
                            arguments.alignment))

            for samline in sam_sample:
                if not samline:
                    continue

                # create Read feature
                (created_read, read) = Read.create_from_sam(
                    samline,
                    Feature.chromosome_conversion.values(),
                    arguments.count_method,
                    arguments.uniquely_mapping,
                    arguments.ignore_strand,
                    arguments.count_secondary_alignments,
                    arguments.count_failed_quality_control,
                    arguments.count_PCR_optical_duplicate,
                    arguments.count_supplementary_alignment)

                # count read (if it exists)
                if created_read:
                    feature.count_read(
                        read, arguments.count_method,
                        arguments.count_splicing,
                        arguments.count_partial_reads,
                        arguments.ignore_strand)

            # output the resulting metagene; appended per feature so finished
            # features are already on disk if a later one fails
            with open(output_path, 'a') as output_file:
                output_file.write("{}\n".format(
                    feature.print_metagene(
                        interval_override=arguments.interval_variable)))