Esempio n. 1
0
def read_states_signals(args):
    """Read the annotated states and the matching normalised/slope signals.

    Args:
        args: Namespace-like object.  Reads ``annotate_file``, ``organism``,
            ``bias_table``, ``reads_file``, ``chrom``, ``start``, ``end``,
            ``downstream_ext``, ``upstream_ext``, ``forward_shift``,
            ``reverse_shift``, ``print_bed_file`` and ``output_bed_file``.

    Returns:
        Tuple ``(states, norm_signal, slope_signal)`` where ``states`` is the
        concatenation of the state tokens found in the annotation file and
        the two signals are whatever ``GenomicSignal.get_signal`` returns.
    """
    # Gather the state tokens first and join once at the end.
    state_tokens = []
    with open(args.annotate_file) as annotation:
        for raw_line in annotation:
            is_data_line = (len(raw_line) >= 2
                            and "#" not in raw_line
                            and "=" not in raw_line)
            if not is_data_line:
                continue
            fields = raw_line.strip().split(" ")
            # The first and the last space-separated field are not states.
            state_tokens.extend(fields[1:-1])
    states = "".join(state_tokens)

    genome_data = GenomeData(args.organism)

    # The bias table is optional; when given it is "forward_file,reverse_file".
    table = None
    if args.bias_table:
        table_loader = BiasTable()
        table_files = args.bias_table.split(",")
        table = table_loader.load_table(table_file_name_F=table_files[0],
                                        table_file_name_R=table_files[1])

    # Normalised and slope signal computed from the raw BAM file.
    signal_source = GenomicSignal(args.reads_file)
    signal_source.load_sg_coefs(slope_window_size=9)
    norm_signal, slope_signal = signal_source.get_signal(
        args.chrom, args.start, args.end,
        args.downstream_ext, args.upstream_ext,
        args.forward_shift, args.reverse_shift,
        bias_table=table, genome_file_name=genome_data.get_genome())

    if args.print_bed_file:
        args.output_bed_file(states)

    return states, norm_signal, slope_signal
Esempio n. 2
0
    def __init__(self,
                 rna_fasta,
                 rna_name,
                 dna_region,
                 organism,
                 showdbs=False):
        """Store the RNA/DNA inputs and prepare the statistics record.

        Args:
            rna_fasta: Path to the RNA FASTA file.
            rna_name: Name of the RNA; when falsy, the name of the first
                record of ``rna_fasta`` is used for ``self.rna_name``.
            dna_region: BED file handed to ``GenomicRegionSet.read_bed``.
            organism: Organism name handed to ``GenomeData`` and to the gene
                association.
            showdbs: Stored unchanged as ``self.showdbs``.
        """
        self.organism = organism
        self.genome_path = GenomeData(organism).get_genome()
        self.rna_fasta = rna_fasta
        self.showdbs = showdbs

        # RNA: fall back to the first FASTA record when no name was given.
        rna_set = SequenceSet(name="rna", seq_type=SequenceType.RNA)
        rna_set.read_fasta(self.rna_fasta)
        self.rna_name = rna_name if rna_name else rna_set[0].name

        # DNA: the region set is replaced by the result of gene_association.
        self.dna_region = GenomicRegionSet(name="target")
        self.dna_region.read_bed(dna_region)
        self.dna_region = self.dna_region.gene_association(
            organism=self.organism, show_dis=True)

        self.topDBD = []
        # The record keeps the ``rna_name`` argument as passed in, not the
        # fallback stored in ``self.rna_name``.
        self.stat = OrderedDict(name=rna_name, genome=organism)
        self.stat["target_regions"] = str(len(self.dna_region))
Esempio n. 3
0
def read_states_signals(args):
    """Return the annotated state string plus normalised and slope signals.

    Args:
        args: Namespace-like object providing ``annotate_file``,
            ``organism``, ``bias_table``, ``reads_file``, the genomic window
            (``chrom``, ``start``, ``end``), the extension and shift settings
            and the ``print_bed_file`` / ``output_bed_file`` attributes.

    Returns:
        Tuple ``(states, norm_signal, slope_signal)``.
    """
    # Lines shorter than two characters or containing "#" or "=" carry no
    # states; of the remaining lines the inner space-separated fields do.
    with open(args.annotate_file) as handle:
        states = "".join(
            token
            for entry in handle
            if not (len(entry) < 2 or "#" in entry or "=" in entry)
            for token in entry.strip().split(" ")[1:-1]
        )

    genome_data = GenomeData(args.organism)

    # Load the forward/reverse bias tables only when they were supplied.
    table = None
    if args.bias_table:
        loader = BiasTable()
        forward_and_reverse = args.bias_table.split(",")
        table = loader.load_table(
            table_file_name_F=forward_and_reverse[0],
            table_file_name_R=forward_and_reverse[1])

    # Signals derived from the raw BAM file.
    bam_signal = GenomicSignal(args.reads_file)
    bam_signal.load_sg_coefs(slope_window_size=9)
    signals = bam_signal.get_signal(
        args.chrom, args.start, args.end,
        args.downstream_ext, args.upstream_ext,
        args.forward_shift, args.reverse_shift,
        bias_table=table, genome_file_name=genome_data.get_genome())
    norm_signal, slope_signal = signals

    if args.print_bed_file:
        args.output_bed_file(states)

    return states, norm_signal, slope_signal
Esempio n. 4
0
def get_bc_signal(arguments):
    """Sum the bias-corrected signal over windows centred on each region.

    Args:
        arguments: 7-tuple ``(mpbs_region, reads_file, organism, window_size,
            forward_shift, reverse_shift, bias_table)``.  ``mpbs_region`` is
            an iterable of regions exposing ``chrom``, ``initial`` and
            ``final``; ``reads_file`` is the BAM file to read.

    Returns:
        numpy array of length ``window_size`` holding the element-wise sum of
        the per-region signals.  Regions whose window would start at or before
        position 0, or whose corrected signal does not contain exactly
        ``window_size`` values, do not contribute.
    """
    (mpbs_region, reads_file, organism, window_size, forward_shift,
     reverse_shift, bias_table) = arguments

    # Loop-invariant values, computed once.
    genome_file_name = GenomeData(organism).get_genome()
    half_window = window_size // 2
    signal = np.zeros(window_size)

    bam = Samfile(reads_file, "rb")
    try:
        for region in mpbs_region:
            mid = (region.final + region.initial) // 2
            p1 = mid - half_window
            p2 = mid + half_window

            # Windows reaching the chromosome start are skipped.
            if p1 <= 0:
                continue

            # Bias-corrected signal of this window.
            _signal = bias_correction(chrom=region.chrom,
                                      start=p1,
                                      end=p2,
                                      bam=bam,
                                      bias_table=bias_table,
                                      genome_file_name=genome_file_name,
                                      forward_shift=forward_shift,
                                      reverse_shift=reverse_shift)
            if len(_signal) != window_size:
                continue

            # Accumulate (this is a running sum, not a smoothing step).
            signal = np.add(signal, np.array(_signal))
    finally:
        # The handle used to be leaked on every call.
        bam.close()

    return signal
Esempio n. 5
0
    def __init__(self, gene_source, tf_source=None, alias_source=None,
                 filter_havana=False, protein_coding=False, known_only=False):
        """Build the annotation from a gene source and optional TF/alias sources.

        Args:
            gene_source: A list (stored as-is as the gene list), a path to an
                existing file (loaded through ``load_gene_list``), or any
                other string, which is handed to ``GenomeData`` to look up
                the annotation file.
            tf_source: Optional list.  A list whose first element is a list
                is stored as-is; otherwise each element is either an existing
                file path or a name resolved with ``MotifData.get_mtf_path``
                and the collected files are passed to ``load_tf_list``.
                Non-list values are ignored.
            alias_source: Optional dict (stored as-is) or string (an existing
                file path, or a name handed to ``GenomeData`` to look up the
                alias file).  Other types are ignored.
            filter_havana: Forwarded to ``load_gene_list``.
            protein_coding: Forwarded to ``load_gene_list``; forced to False
                when ``gene_source`` is a file path.
            known_only: Forwarded to ``load_gene_list``; forced to False when
                ``gene_source`` is a file path.
        """
        # Containers filled below.
        self.gene_list = []  # Gene annotation.
        self.tf_list = []  # TF PWM annotation.
        self.alias_dict = dict()  # Gene symbol or other IDs -> ENSEMBL ID.
        self.symbol_dict = dict()  # ENSEMBL ID -> official gene symbol.

        # Required field: gene list.
        if isinstance(gene_source, list):
            # A ready-made matrix, used by internal methods.
            self.gene_list = gene_source
        elif isinstance(gene_source, str):
            if os.path.isfile(gene_source):
                # Path to a gtf file: the protein_coding and known_only
                # filters are always switched off on this path
                # (original note: "FTT for TDF True").
                self.load_gene_list(gene_source,
                                    filter_havana=filter_havana,
                                    protein_coding=False,
                                    known_only=False)
            else:
                # Organism name pointing to a gtf file within data.config.
                annotation_file = GenomeData(gene_source).get_annotation()
                self.load_gene_list(annotation_file,
                                    filter_havana=filter_havana,
                                    protein_coding=protein_coding,
                                    known_only=known_only)

        # Optional field: TF list.  TODO: non-list sources are silently
        # ignored instead of raising an error.
        if tf_source and isinstance(tf_source, list):
            if isinstance(tf_source[0], list):
                # Already a matrix.
                self.tf_list = tf_source
            else:
                motif_data = MotifData()
                # Each entry is an mtf file path or a name resolved to one.
                mtf_files = [
                    entry if os.path.isfile(entry)
                    else motif_data.get_mtf_path(entry)
                    for entry in tf_source
                ]
                self.load_tf_list(mtf_files)

        # Optional field: alias dictionary.  TODO: other types are silently
        # ignored instead of raising an error.
        if alias_source:
            if isinstance(alias_source, dict):
                # A ready-made dictionary, used by internal methods.
                self.alias_dict = alias_source
            elif isinstance(alias_source, str):
                if os.path.isfile(alias_source):
                    # Path to a txt alias file.
                    self.load_alias_dict(alias_source)
                else:
                    # Organism name pointing to an alias file in data.config.
                    alias_file = GenomeData(alias_source).get_gene_alias()
                    self.load_alias_dict(alias_file)
Esempio n. 6
0
def get_bc_signal(arguments):
    """Sum bias-corrected signals of two conditions around one motif's sites.

    Args:
        arguments: 11-tuple ``(mpbs_name, mpbs_file1, mpbs_file2,
            reads_file1, reads_file2, organism, window_size, forward_shift,
            reverse_shift, bias_table1, bias_table2)``.

    Returns:
        Tuple ``(signal_1, signal_2, motif_len, pwm, num_motif)``:
        the summed signals of both conditions (numpy arrays of length
        ``window_size``), the length of the first site of the motif (None
        when there is no site), the ``pwm`` dictionary as updated by
        ``update_pwm`` for every contributing site, and the number of sites
        carrying ``mpbs_name``.
    """
    (mpbs_name, mpbs_file1, mpbs_file2, reads_file1, reads_file2, organism,
     window_size, forward_shift, reverse_shift, bias_table1, bias_table2) = arguments

    mpbs1 = GenomicRegionSet("Motif Predicted Binding Sites of Condition1")
    mpbs1.read(mpbs_file1)

    mpbs2 = GenomicRegionSet("Motif Predicted Binding Sites of Condition2")
    mpbs2.read(mpbs_file2)

    mpbs = mpbs1.combine(mpbs2, output=True)
    mpbs.sort()

    # Loop-invariant values, computed once.
    genome_file_name = GenomeData(organism).get_genome()
    half_window = window_size // 2

    signal_1 = np.zeros(window_size)
    signal_2 = np.zeros(window_size)
    motif_len = None
    pwm = {base: [0.0] * window_size for base in ("A", "C", "G", "T", "N")}

    mpbs_regions = mpbs.by_names([mpbs_name])
    num_motif = len(mpbs_regions)

    bam1 = bam2 = fasta = None
    try:
        bam1 = Samfile(reads_file1, "rb")
        bam2 = Samfile(reads_file2, "rb")
        fasta = Fastafile(genome_file_name)

        for region in mpbs_regions:
            if motif_len is None:
                motif_len = region.final - region.initial

            # Floor division keeps the coordinates integral; true division
            # would turn them into floats on Python 3.
            mid = (region.final + region.initial) // 2
            p1 = mid - half_window
            p2 = mid + half_window

            # Windows reaching the chromosome start are skipped.
            if p1 <= 0:
                continue

            # Bias-corrected signal of this window in each condition.
            signal1 = bias_correction(chrom=region.chrom, start=p1, end=p2, bam=bam1,
                                      bias_table=bias_table1, genome_file_name=genome_file_name,
                                      forward_shift=forward_shift, reverse_shift=reverse_shift)

            signal2 = bias_correction(chrom=region.chrom, start=p1, end=p2, bam=bam2,
                                      bias_table=bias_table2, genome_file_name=genome_file_name,
                                      forward_shift=forward_shift, reverse_shift=reverse_shift)

            if len(signal1) != len(signal_1) or len(signal2) != len(signal_2):
                continue

            # Accumulate (running sums, not a smoothing step).
            signal_1 = np.add(signal_1, np.array(signal1))
            signal_2 = np.add(signal_2, np.array(signal2))

            update_pwm(pwm, fasta, region, p1, p2)
    finally:
        # The handles used to be leaked on every call.
        for handle in (fasta, bam2, bam1):
            if handle is not None:
                handle.close()

    return signal_1, signal_2, motif_len, pwm, num_motif
Esempio n. 7
0
def get_raw_tracks(args):
    """Write the per-base cut-site signal of each region as a WIG track.

    For every merged region read from ``args.input_files[1]`` the shifted cut
    sites of the reads fetched from the BAM file ``args.input_files[0]`` are
    counted and one ``fixedStep`` section is appended to
    ``<output_location>/<output_prefix>.wig``.  With ``args.bigWig`` set, the
    WIG file is converted by the external ``wigToBigWig`` tool and removed
    once the conversion has succeeded.

    Args:
        args: Namespace-like object.  Reads ``input_files``,
            ``output_location``, ``output_prefix``, ``forward_shift``,
            ``reverse_shift``, ``norm``, ``bigWig`` and ``organism``.
    """
    import subprocess  # local import keeps this block self-contained

    # Initializing Error Handler
    err = ErrorHandler()

    if len(args.input_files) != 2:
        err.throw_error("ME_FEW_ARG",
                        add_msg="You must specify reads and regions file.")

    output_fname = os.path.join(args.output_location,
                                "{}.wig".format(args.output_prefix))

    bam = Samfile(args.input_files[0], "rb")
    try:
        regions = GenomicRegionSet("Interested regions")
        regions.read(args.input_files[1])
        regions.merge()
        reads_file = GenomicSignal()

        # Append mode: an existing file of the same name is extended.
        with open(output_fname, "a") as output_f:
            for region in regions:
                # Raw counts, one slot per base of the region.
                signal = [0.0] * (region.final - region.initial)
                for read in bam.fetch(region.chrom, region.initial, region.final):
                    if read.is_reverse:
                        cut_site = read.aend + args.reverse_shift - 1
                    else:
                        cut_site = read.pos + args.forward_shift
                    # Cut sites shifted outside the region are dropped.
                    if region.initial <= cut_site < region.final:
                        signal[cut_site - region.initial] += 1.0

                if args.norm:
                    signal = reads_file.boyle_norm(signal)
                    perc = scoreatpercentile(signal, 98)
                    std = np.std(signal)
                    signal = reads_file.hon_norm_atac(signal, perc, std)

                # WIG positions are 1-based, hence the + 1.
                output_f.write("fixedStep chrom=" + region.chrom + " start=" +
                               str(region.initial + 1) + " step=1\n" +
                               "\n".join([str(e)
                                          for e in np.nan_to_num(signal)]) + "\n")
    finally:
        # The handle used to be leaked.
        bam.close()

    if args.bigWig:
        genome_data = GenomeData(args.organism)
        chrom_sizes_file = genome_data.get_chromosome_sizes()
        bw_filename = os.path.join(args.output_location,
                                   "{}.bw".format(args.output_prefix))
        # An argument list (no shell) keeps paths containing spaces or shell
        # metacharacters intact.
        try:
            status = subprocess.call([
                "wigToBigWig", output_fname, chrom_sizes_file, bw_filename,
                "-verbose=0"
            ])
        except OSError:
            # The converter could not be started (e.g. it is not installed).
            status = None
        # Keep the WIG file when the conversion failed, otherwise the only
        # copy of the signal would be lost.
        if status == 0:
            os.remove(output_fname)
def main():
    """Command-line entry point: compute footprint signal profiles and emit them.

    Steps, in order:
      1. Parse the arguments with ``args()``.
      2. When the genome FASTA of the requested assembly is missing below
         ``$RGTDATA`` (default ``/rgtdata``), run the genomic-data setup
         script for that assembly.
      3. Compute the signal with ``footprint``, on the BED file itself or,
         when an occurrence threshold is given, on the filtered regions.
      4. Optionally aggregate and plot the signal.
      5. Write the result as JSON, or as TSV when both ``output_as_tsv`` and
         ``aggregate`` are set, to ``output_file`` or to standard output.

    Returns:
        int: 0 on success, 1 when the genome data is missing and could not be
        set up.
    """
    import subprocess  # local import keeps this block self-contained

    cArgs = args()
    assembly = cArgs.assembly

    root = os.environ.get("RGTDATA", "/rgtdata")
    genome_fasta = root + "/{assembly}/genome_{assembly}.fa".format(assembly=assembly)
    if not os.path.exists(genome_fasta):
        print(
            "WARNING: genomic data is not present for {assembly}. We will attempt to download it.".format(assembly=assembly),
            file=sys.stderr
        )
        print(
            "If you are running many jobs, they might run faster if you mount the appropriate data at {root}/{assembly}.".format(root=root, assembly=assembly),
            file=sys.stderr
        )
        # An argument list (no shell) prevents the user-supplied assembly
        # name from being interpreted by a shell.
        try:
            result = subprocess.call(
                ["python3", "/reg-gen/data/setupGenomicData.py", "--{assembly}".format(assembly=assembly)]
            )
        except OSError:
            # The interpreter could not be started; treat as a failed setup.
            result = 1
        if result != 0:
            print("FATAL: Unable to load genome data for {assembly}.".format(assembly=assembly), file=sys.stderr)
            return 1

    if cArgs.occurrence_threshold is None:
        signal = footprint(cArgs.bam, cArgs.bed, assembly, cArgs.ext_size, cArgs.dnase, cArgs.bias_type)
    else:
        g = GenomeData(organism=assembly)
        with FilteredRegions(cArgs.bed, cArgs.occurrence_threshold, g.get_genome(), g.get_chromosome_sizes()) as b:
            signal = footprint(cArgs.bam, b.name, assembly, cArgs.ext_size, cArgs.dnase, cArgs.bias_type)

    if cArgs.aggregate or cArgs.plot_output is not None:
        # Without a threshold everything is grouped under "all"; with one,
        # regions are grouped by their name.
        signal = aggregate(signal, (lambda x: "all") if cArgs.occurrence_threshold is None else lambda x: x["name"], cArgs.ext_size)
        if cArgs.plot_output is not None:
            # NOTE(review): with an occurrence threshold the grouping key is
            # the region name, so an "all" entry may be absent here; confirm
            # against aggregate().
            plot(signal["all"]["forward"], signal["all"]["reverse"], cArgs.font, cArgs.plot_output)

    # Render once, then send the text to the chosen destination.
    if cArgs.output_as_tsv and cArgs.aggregate:
        rendered = "".join(
            "%s\t%s\t%s\n" % (k, ','.join([str(x) for x in v["forward"]]), ','.join([str(x) for x in v["reverse"]]))
            for k, v in signal.items()
            if k != "all"
        )
    else:
        rendered = json.dumps(signal) + '\n'

    if cArgs.output_file is None:
        sys.stdout.write(rendered)
    else:
        with open(cArgs.output_file, 'w') as o:
            o.write(rendered)

    return 0
Esempio n. 9
0
def get_raw_tracks(args):
    """Write the per-base cut-site signal of each region as a WIG track.

    Reads are fetched from the BAM file ``args.input_files[0]`` for every
    merged region of ``args.input_files[1]``; their shifted cut sites are
    counted per base and appended as ``fixedStep`` sections to
    ``<output_location>/<output_prefix>.wig``.  With ``args.bigWig`` set, the
    WIG file is converted by the external ``wigToBigWig`` tool and removed
    once the conversion has succeeded.

    Args:
        args: Namespace-like object.  Reads ``input_files``,
            ``output_location``, ``output_prefix``, ``forward_shift``,
            ``reverse_shift``, ``norm``, ``bigWig`` and ``organism``.
    """
    import subprocess  # local import keeps this block self-contained

    # Initializing Error Handler
    err = ErrorHandler()

    if len(args.input_files) != 2:
        err.throw_error("ME_FEW_ARG", add_msg="You must specify reads and regions file.")

    output_fname = os.path.join(args.output_location, "{}.wig".format(args.output_prefix))

    bam = Samfile(args.input_files[0], "rb")
    try:
        regions = GenomicRegionSet("Interested regions")
        regions.read(args.input_files[1])
        regions.merge()
        reads_file = GenomicSignal()

        # Append mode: an existing file of the same name is extended.
        with open(output_fname, "a") as output_f:
            for region in regions:
                # Raw counts, one slot per base of the region.
                signal = [0.0] * (region.final - region.initial)
                for read in bam.fetch(region.chrom, region.initial, region.final):
                    if read.is_reverse:
                        cut_site = read.aend + args.reverse_shift - 1
                    else:
                        cut_site = read.pos + args.forward_shift
                    # Cut sites shifted outside the region are dropped.
                    if region.initial <= cut_site < region.final:
                        signal[cut_site - region.initial] += 1.0

                if args.norm:
                    signal = reads_file.boyle_norm(signal)
                    perc = scoreatpercentile(signal, 98)
                    std = np.std(signal)
                    signal = reads_file.hon_norm_atac(signal, perc, std)

                # WIG positions are 1-based, hence the + 1.
                output_f.write("fixedStep chrom=" + region.chrom + " start=" + str(region.initial + 1) + " step=1\n" +
                               "\n".join([str(e) for e in np.nan_to_num(signal)]) + "\n")
    finally:
        # The handle used to be leaked.
        bam.close()

    if args.bigWig:
        genome_data = GenomeData(args.organism)
        chrom_sizes_file = genome_data.get_chromosome_sizes()
        bw_filename = os.path.join(args.output_location, "{}.bw".format(args.output_prefix))
        # An argument list (no shell) keeps paths containing spaces or shell
        # metacharacters intact.
        try:
            status = subprocess.call(["wigToBigWig", output_fname, chrom_sizes_file, bw_filename, "-verbose=0"])
        except OSError:
            # The converter could not be started (e.g. it is not installed).
            status = None
        # Keep the WIG file when the conversion failed, otherwise the only
        # copy of the signal would be lost.
        if status == 0:
            os.remove(output_fname)
Esempio n. 10
0
def get_dbss(input_BED, output_BED, rna_fasta, output_rbss, organism, l, e, c, fr, fm, of, mf, rm, temp):
    """Search triplexes between an RNA and target regions and write the hits.

    The binding events found by ``find_triplex`` are written to
    ``output_BED`` and to a sibling file whose ".bed" is replaced by ".txp";
    the RNA binding sites are then passed to ``dbd_regions`` together with
    ``output_rbss``.

    Args:
        input_BED: BED file with the target DNA regions.
        output_BED: BED file receiving the binding events.
        rna_fasta: RNA FASTA file.
        output_rbss: Output argument of ``dbd_regions``.
        organism: Organism name used for gene association and genome lookup.
        l, e, c, fr, fm, of, mf: Forwarded unchanged to ``find_triplex``.
        rm: Not used by this function.
        temp: Directory for temporary files.
    """
    target_regions = GenomicRegionSet("Target")
    target_regions.read_bed(input_BED)
    # NOTE(review): the return value is discarded here; confirm that
    # gene_association is meant to be called only for its side effects.
    target_regions.gene_association(organism=organism, show_dis=True)

    connect_rna(rna_fasta, temp=temp, rna_name="RNA")
    rna_set = SequenceSet(name="rna", seq_type=SequenceType.RNA)
    rna_set.read_fasta(os.path.join(temp, "rna_temp.fa"))
    rna_exons = get_rna_region_str(os.path.join(temp, rna_fasta))

    genome_path = GenomeData(organism).get_genome()
    triplexes = find_triplex(rna_fasta=rna_fasta, dna_region=target_regions,
                             temp=temp, organism=organism, remove_temp=False,
                             l=l, e=e, c=c, fr=fr, fm=fm, of=of, mf=mf,
                             genome_path=genome_path,
                             prefix="targeted_region", dna_fine_posi=True)

    print("Total binding events:\t", str(len(triplexes)))
    triplexes.write_bed(output_BED)
    txp_filename = output_BED.replace(".bed", ".txp")
    triplexes.write_txp(filename=txp_filename)

    binding_sites = triplexes.get_rbs()
    dbd_regions(exons=rna_exons, sig_region=binding_sites, rna_name="rna",
                output=output_rbss, out_file=True, temp=temp, fasta=False)
Esempio n. 11
0
    def __init__(self, rna_fasta, rna_name, dna_region, organism, showdbs=False):
        """Set up the RNA, the DNA target regions and the statistics record.

        Args:
            rna_fasta: Path to the RNA FASTA file.
            rna_name: RNA name; when falsy, ``self.rna_name`` takes the name
                of the first record of ``rna_fasta`` instead.
            dna_region: BED file read into the target region set.
            organism: Organism name used for the genome lookup and the gene
                association.
            showdbs: Stored unchanged as ``self.showdbs``.
        """
        self.organism = organism
        self.genome_path = GenomeData(organism).get_genome()
        self.rna_fasta = rna_fasta
        self.showdbs = showdbs

        # RNA: use the first FASTA record's name when none was supplied.
        fasta_records = SequenceSet(name="rna", seq_type=SequenceType.RNA)
        fasta_records.read_fasta(self.rna_fasta)
        self.rna_name = rna_name if rna_name else fasta_records[0].name

        # DNA: the region set is replaced by the result of gene_association.
        self.dna_region = GenomicRegionSet(name="target")
        self.dna_region.read_bed(dna_region)
        self.dna_region = self.dna_region.gene_association(organism=self.organism,
                                                           show_dis=True)

        self.topDBD = []
        # The record keeps the ``rna_name`` argument as passed in, not the
        # fallback stored in ``self.rna_name``.
        self.stat = OrderedDict(name=rna_name, genome=organism)
        self.stat["target_regions"] = str(len(self.dna_region))
Esempio n. 12
0
    def __init__(self, gene_source, tf_source=None, alias_source=None,
                 filter_havana=False, protein_coding=False, known_only=False):
        """Build the annotation from a gene source and optional TF/alias sources.

        Args:
            gene_source: A list (stored as-is as the gene list), a path to an
                existing file (loaded through ``load_gene_list``), or any
                other string, which is handed to ``GenomeData`` to look up
                the annotation file.
            tf_source: Optional list.  A list whose first element is a list
                is stored as-is; otherwise each element is either an existing
                file path or a name resolved with ``MotifData.get_mtf_path``
                and ``self.tf_list`` becomes the return value of
                ``load_tf_list`` for the collected files.  Non-list values
                are ignored.
            alias_source: Optional dict (stored as-is) or string (an existing
                file path, or a name handed to ``GenomeData`` to look up the
                alias file).  Other types are ignored.
            filter_havana: Forwarded to ``load_gene_list``.
            protein_coding: Forwarded to ``load_gene_list``; forced to False
                when ``gene_source`` is a file path.
            known_only: Forwarded to ``load_gene_list``; forced to False when
                ``gene_source`` is a file path.
        """
        # Containers filled below.
        self.gene_list = []  # Gene annotation.
        self.tf_list = []  # TF PWM annotation.
        self.alias_dict = dict()  # Gene symbol or other IDs -> ENSEMBL ID.
        self.symbol_dict = dict()  # ENSEMBL ID -> official gene symbol.

        # Required field: gene list.
        if isinstance(gene_source, list):
            # A ready-made matrix, used by internal methods.
            self.gene_list = gene_source
        elif isinstance(gene_source, str):
            if os.path.isfile(gene_source):
                # Path to a gtf file: the protein_coding and known_only
                # filters are always switched off on this path
                # (original note: "FTT for TDF True").
                self.load_gene_list(gene_source,
                                    filter_havana=filter_havana,
                                    protein_coding=False,
                                    known_only=False)
            else:
                # Organism name pointing to a gtf file within data.config.
                annotation_file = GenomeData(gene_source).get_annotation()
                self.load_gene_list(annotation_file,
                                    filter_havana=filter_havana,
                                    protein_coding=protein_coding,
                                    known_only=known_only)

        # Optional field: TF list.  TODO: non-list sources are silently
        # ignored instead of raising an error.
        if tf_source and isinstance(tf_source, list):
            if isinstance(tf_source[0], list):
                # Already a matrix.
                self.tf_list = tf_source
            else:
                motif_data = MotifData()
                # Each entry is an mtf file path or a name resolved to one.
                mtf_files = [
                    entry if os.path.isfile(entry)
                    else motif_data.get_mtf_path(entry)
                    for entry in tf_source
                ]
                self.tf_list = self.load_tf_list(mtf_files)

        # Optional field: alias dictionary.  TODO: other types are silently
        # ignored instead of raising an error.
        if alias_source:
            if isinstance(alias_source, dict):
                # A ready-made dictionary, used by internal methods.
                self.alias_dict = alias_source
            elif isinstance(alias_source, str):
                if os.path.isfile(alias_source):
                    # Path to a txt alias file.
                    self.load_alias_dict(alias_source)
                else:
                    # Organism name pointing to an alias file in data.config.
                    alias_file = GenomeData(alias_source).get_gene_alias()
                    self.load_alias_dict(alias_file)
Esempio n. 13
0
class MatchTest(unittest.TestCase):
    """Checks ``match_multiple`` for the CTCF motif on a stretch of hg19 chr1."""

    def setUp(self):
        # The hg19 genome must be available.
        # TODO: we could make this test pure by manually using the sequence
        # corresponding to the input region
        self.genome_data = GenomeData("hg19")
        self.genome_file = Fastafile(self.genome_data.get_genome())

    def test_match_multiple(self):
        """The scan reports one hit per orientation at the expected positions."""
        here = os.path.dirname(__file__)
        jaspar_dir = "../../data/motifs/jaspar_vertebrates/"

        scanner = scan.Scanner(7)

        motif_path = os.path.join(here, jaspar_dir, "MA0139.1.CTCF.pwm")
        motif = Motif(motif_path, 1, 0.0001, None)

        # One entry per orientation: the motif first, then its reverse
        # complement.
        thresholds = [motif.threshold, motif.threshold_rc]
        pssm_list = [motif.pssm, motif.pssm_rc]

        background = tools.flat_bg(4)
        scanner.set_motifs(pssm_list, background, thresholds)

        genomic_region = GenomicRegion("chr1", 710000, 715000)

        # Sequence associated to genomic_region.
        fetched = self.genome_file.fetch(genomic_region.chrom,
                                         genomic_region.initial,
                                         genomic_region.final)
        sequence = str(fetched)

        grs = match_multiple(scanner, [motif], sequence, genomic_region)

        expected = [
            GenomicRegion(
                "chr1", 714270, 714289, name="MA0139.1.CTCF", orientation="+"),
            GenomicRegion(
                "chr1", 714180, 714199, name="MA0139.1.CTCF", orientation="-")
        ]
        self.assertSequenceEqual(grs.sequences, expected)
Esempio n. 14
0
class MatchTest(unittest.TestCase):
    """Checks ``match_multiple`` for the CTCF motif on a bundled chr1 excerpt."""

    def setUp(self):
        # The genome path is pointed at a FASTA file located next to this
        # test module.
        # TODO: we could make this test pure by manually using the sequence
        # corresponding to the input region
        self.genome_data = GenomeData("hg19")
        local_fasta = os.path.join(os.path.dirname(__file__), "hg19_chr1_710000_715000.fa")
        self.genome_data.genome = local_fasta
        self.genome_file = Fastafile(self.genome_data.get_genome())

    def test_match_multiple(self):
        """The scan reports one hit per orientation at the expected positions."""
        motif_set = MotifSet(preload_motifs="default")
        motif_set = motif_set.filter(
            {'database': ["jaspar_vertebrates"], 'name': ["MA0139.1.CTCF"]},
            search="inexact")

        self.assertEqual(len(motif_set), 1)

        motif = motif_set.get_motif_list(1, 0.0001)[0]

        scanner = scan.Scanner(7)

        # The same threshold is used for both orientations.
        thresholds = [motif.threshold, motif.threshold]
        pssm_list = [motif.pssm, motif.pssm_rc]

        background = tools.flat_bg(4)
        scanner.set_motifs(pssm_list, background, thresholds)

        genomic_region = GenomicRegion("chr1", 0, 5022)

        # Sequence associated to genomic_region.
        fetched = self.genome_file.fetch(genomic_region.chrom,
                                         genomic_region.initial,
                                         genomic_region.final)
        sequence = str(fetched)

        grs = match_multiple(scanner, [motif], sequence, genomic_region)

        expected = [
            GenomicRegion("chr1", 4270, 4289, name="MA0139.1.CTCF", orientation="+"),
            GenomicRegion("chr1", 4180, 4199, name="MA0139.1.CTCF", orientation="-"),
        ]
        self.assertSequenceEqual(grs.sequences, expected)
Esempio n. 15
0
        



##################################################################################
# Command-line interface: a BED file (or a directory of BED files), the
# output directory and the organism whose genome is used.
parser = argparse.ArgumentParser(description='Convert BED files into FASTAs', 
                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-bed', type=str, help="BED file or a directory containing BED files")
parser.add_argument('-output', type=str, help="Define the output directory")
parser.add_argument('-organism', type=str, help="Define the organism")
args = parser.parse_args()

# Create the output directory when it does not exist yet.
if not os.path.exists(args.output):
    os.makedirs(args.output)

# Genome data of the requested organism; its genome path is used below.
genome = GenomeData(args.organism)

if os.path.isfile(args.bed):
    load_exon_sequence(bed=args.bed, directory=args.output, genome_path=genome.get_genome())

elif os.path.isdir(args.bed):

    for root, dirnames, filenames in os.walk(args.bed):
            
        for filename in filenames:
            if ".bed" in filename:
                print(filename)
                fn = os.path.basename(filename)
                fn = fn.partition(".bed")[0]
                try:
                    load_exon_sequence(bed=os.path.join(args.bed,filename), 
Esempio n. 16
0
# print("\tPromoters "+str(len(promoters)))
# gd = GenomeData(organism=genome)
# print("\t"+gd.get_annotation())
# print("\tloading GenomeData... succeeds")

# Smoke check for hg38: load the annotation set, count its promoters and
# print the annotation path reported by GenomeData.
genome = "hg38"
print("Checking " + genome)
annot = AnnotationSet(genome,
                      filter_havana=False,
                      protein_coding=True,
                      known_only=False)
# Alternative call with all filters enabled, kept for reference:
# annot = AnnotationSet(genome,filter_havana=True,protein_coding=True,known_only=True)
print("\tloading AnnotationSet... succeeds")
promoters = annot.get_promoters()
print("\tPromoters " + str(len(promoters)))
gd = GenomeData(organism=genome)
print("\t" + gd.get_annotation())
print("\tloading GenomeData... succeeds")

# genome = "mm9"
# print("Checking " + genome)
# annot = AnnotationSet(genome,filter_havana=True,protein_coding=True,known_only=True)
# print("\tloading AnnotationSet... succeeds")
# promoters = annot.get_promoters()
# print("\tPromoters "+str(len(promoters)))
# gd = GenomeData(organism=genome)
# print("\t"+gd.get_annotation())
# print("\tloading GenomeData... succeeds")

# genome = "zv9"
# print("Checking " + genome)
def footprint(bam: str,
              bed: str,
              assembly: str = "hg38",
              w: int = 500,
              dnase: bool = False,
              bias_type="SH"):
    """Compute bias-corrected forward/reverse signal profiles for BED regions.

    Args:
        bam: Path to the BAM file with the reads.
        bed: Path to a whitespace-separated BED file.  Columns 1-3 are
            required; column 4 (name) defaults to None and column 5 (used
            here as the strand) to '.'.  Blank lines are ignored.
        assembly: Organism/assembly name handed to ``GenomeData``.
        w: Passed to ``expandRegion`` for every region.
        dnase: Use the DNase bias tables and ``GenomicSignal.get_signal``
            instead of the ATAC tables and ``get_signal_atac``.
        bias_type: DNase bias table to use, 'SH' or 'DH'.  Ignored for ATAC.

    Returns:
        list: One ``regionDict(region, forward, reverse)`` per region whose
        signal could be computed.  For regions on the '-' strand both
        profiles are reversed and swapped.

    Raises:
        ValueError: If ``dnase`` is set and ``bias_type`` is neither 'SH'
            nor 'DH'.
    """
    g = GenomeData(organism=assembly)
    genome_file = g.get_genome()
    hmm_data = HmmData()

    # Pick the forward/reverse bias tables for the assay.
    if dnase:
        if bias_type == 'SH':
            table_F = hmm_data.get_default_bias_table_F_SH()
            table_R = hmm_data.get_default_bias_table_R_SH()
        elif bias_type == 'DH':
            table_F = hmm_data.get_default_bias_table_F_DH()
            table_R = hmm_data.get_default_bias_table_R_DH()
        else:
            # Previously this left the bias table undefined, which made every
            # region fail silently further down.
            raise ValueError(
                "unsupported bias_type %r for DNase data; expected 'SH' or 'DH'"
                % (bias_type,))
    else:
        table_F = hmm_data.get_default_bias_table_F_ATAC()
        table_R = hmm_data.get_default_bias_table_R_ATAC()
    bias_table = BiasTable().load_table(table_file_name_F=table_F,
                                        table_file_name_R=table_R)

    # load reads from BAM
    reads_file = GenomicSignal(bam)
    reads_file.load_sg_coefs(SG_WINDOW_SIZE)

    # Open both inputs once so that an unreadable BAM or FASTA fails loudly
    # here instead of being swallowed region by region below; the handles are
    # not needed afterwards and are closed right away.
    Samfile(bam, "rb").close()
    Fastafile(genome_file).close()

    # load and expand regions
    regions = []
    with open(bed, 'r') as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                continue
            name = fields[3] if len(fields) >= 4 else None
            strand = fields[4] if len(fields) >= 5 else '.'
            regions.append(expandRegion(*fields[:3], name, w, strand))

    # Assay-dependent settings, resolved once.
    # NOTE(review): the result of either callable is unpacked into four
    # values below; confirm that get_signal returns four for DNase data.
    get_reads = reads_file.get_signal if dnase else reads_file.get_signal_atac
    forward_shift = 0 if dnase else FORWARD_SHIFT
    reverse_shift = 0 if dnase else REVERSE_SHIFT
    initial_clip = 1000 if dnase else 150

    # load signal
    forward = []
    reverse = []
    failed = 0
    for i, region in enumerate(regions):
        try:
            chromosome, start, end, _, strand = region
            norm_f, _slope_f, norm_r, _slope_r = get_reads(
                chromosome, start, end, 0, 0,
                forward_shift, reverse_shift, initial_clip, 98,
                98, bias_table, genome_file)
            norm_f = [float(v) for v in norm_f]
            norm_r = [float(v) for v in norm_r]
        except Exception:
            # Best effort: a region whose signal cannot be computed is
            # counted and left out of the result.
            forward.append(None)
            reverse.append(None)
            failed += 1
            continue

        if strand == '-':
            # Minus-strand regions are read in the opposite direction, so
            # both profiles are reversed and exchange their roles.
            norm_f.reverse()
            norm_r.reverse()
            norm_f, norm_r = norm_r, norm_f
        forward.append(norm_f)
        reverse.append(norm_r)
        if i % 500 == 0:
            print("INFO: aggregating region %d of %d" % (i, len(regions)),
                  file=sys.stderr)

    if failed > 0:
        print(
            "WARNING: failed to generate bias-corrected signal profiles for %d regions"
            % failed,
            file=sys.stderr)

    return [
        regionDict(region, fwd, rev)
        for region, fwd, rev in zip(regions, forward, reverse)
        if fwd is not None and rev is not None
    ]
Esempio n. 18
0
def estimate_bias_kmer(args):
    """Estimate forward/reverse k-mer bias tables and hand them to ``write_table``.

    For every region read from ``args.regions_file``:

    * observed counts -- for each read fetched from the BAM file, the k-mer
      window starting ``floor(k_nb / 2)`` bases before the shifted cut site is
      fetched from the genome and counted per strand (reverse-strand k-mers are
      reverse-complemented first);
    * expected counts -- every k-mer of the region sequence and of its reverse
      complement is counted.

    The bias of a k-mer is ``(obs / #reads) / (exp / #k-mers)`` with a
    pseudocount added to ``obs`` and ``exp``, rounded to six decimals.  When a
    strand has no reads (or no expected k-mer could be counted at all) every
    k-mer of that table gets the neutral value ``1``.

    Keyword arguments:
    args -- Namespace providing ``reads_file``, ``organism``, ``regions_file``,
            ``k_nb``, ``forward_shift``, ``reverse_shift``, ``output_location``
            and ``output_prefix``.

    Return:
    None -- the two tables are written through ``write_table``.
    """
    # Parameters
    max_duplicates = 100
    pseudocount = 1.0
    k = args.k_nb
    half_k = int(floor(k / 2))

    # Observed / expected k-mer counters for the forward and reverse strands
    obs_f, obs_r, exp_f, exp_r = {}, {}, {}, {}
    ct_reads_f = 0
    ct_reads_r = 0
    ct_kmers = 0

    # Initializing bam and fasta; the finally clause guarantees both handles
    # are released even if a fetch or the region parsing raises.
    bam_file = Samfile(args.reads_file, "rb")
    fasta_file = None
    try:
        genome_data = GenomeData(args.organism)
        fasta_file = Fastafile(genome_data.get_genome())
        regions = GenomicRegionSet("regions")
        regions.read(args.regions_file)

        # Iterating on HS regions
        for region in regions:
            prev_pos = -1
            duplicate_run = 0

            # Evaluating observed frequencies ################################
            for read in bam_file.fetch(region.chrom, region.initial, region.final):
                # Calculating positions
                if read.is_reverse:
                    cut_site = read.aend + args.reverse_shift + 1
                else:
                    cut_site = read.pos + args.forward_shift - 1
                p1 = cut_site - half_k
                p2 = p1 + k

                # Verifying PCR artifacts: skip a read once more than
                # max_duplicates consecutive reads started at the same p1.
                if p1 == prev_pos:
                    duplicate_run += 1
                else:
                    prev_pos = p1
                    duplicate_run = 0
                if duplicate_run > max_duplicates:
                    continue

                # Fetching k-mer (best effort: unreadable windows are skipped)
                try:
                    kmer = str(fasta_file.fetch(region.chrom, p1, p2)).upper()
                except Exception:
                    continue

                # Counting k-mer in dictionary
                if read.is_reverse:
                    kmer = AuxiliaryFunctions.revcomp(kmer)
                    ct_reads_r += 1
                    obs_r[kmer] = obs_r.get(kmer, 0) + 1
                else:
                    ct_reads_f += 1
                    obs_f[kmer] = obs_f.get(kmer, 0) + 1

            # Evaluating expected frequencies ################################
            try:
                sequence = str(fasta_file.fetch(region.chrom, region.initial,
                                                region.final)).upper()
            except Exception:
                continue
            rev_sequence = AuxiliaryFunctions.revcomp(sequence)

            # NOTE(review): this range stops one short of the last full k-mer
            # (start index len - k); kept unchanged so counts stay comparable
            # with previously generated tables.
            for i in range(0, len(sequence) - k):
                ct_kmers += 1
                s = sequence[i:i + k]
                exp_f[s] = exp_f.get(s, 0) + 1
                s = rev_sequence[i:i + k]
                exp_r[s] = exp_r.get(s, 0) + 1
    finally:
        # Closing files
        bam_file.close()
        if fasta_file is not None:
            fasta_file.close()

    def bias(obs_counts, exp_counts, n_reads, kmer):
        # Neutral bias when there is nothing to normalise by; this also
        # avoids a ZeroDivisionError when no expected k-mer was counted.
        if n_reads == 0 or ct_kmers == 0:
            return 1
        obs = obs_counts.get(kmer, 0) + pseudocount
        exp = exp_counts.get(kmer, 0) + pseudocount
        return round(float(obs / n_reads) / float(exp / ct_kmers), 6)

    # Creating bias dictionary
    alphabet = ["A", "C", "G", "T"]
    kmer_comb = ["".join(e) for e in product(alphabet, repeat=k)]
    bias_table_f = {}
    bias_table_r = {}
    for kmer in kmer_comb:
        bias_table_f[kmer] = bias(obs_f, exp_f, ct_reads_f, kmer)
        bias_table_r[kmer] = bias(obs_r, exp_r, ct_reads_r, kmer)

    write_table(args.output_location, args.output_prefix, [bias_table_f, bias_table_r])
Esempio n. 19
0
def create_signal(args, regions):
    """Count observed and expected k-mers over ``regions`` and write them to disk.

    Observed k-mers are taken around the shifted cut site of every read
    fetched from ``args.reads_file``; expected k-mers are taken from the
    sequence of each region and from its reverse complement.  Only k-mers
    made exclusively of A/C/G/T and of full length ``args.k_nb`` are counted.

    Four tab-separated files (``<k>_f_obs.fa``, ``<k>_f_exp.fa``,
    ``<k>_r_obs.fa``, ``<k>_r_exp.fa``) are written to
    ``args.output_location``, one ``kmer<TAB>count`` line per k-mer whose
    count is greater than zero.

    Keyword arguments:
    args -- Namespace providing ``k_nb``, ``reads_file``, ``organism``,
            ``forward_shift``, ``reverse_shift`` and ``output_location``.
    regions -- Iterable of regions exposing ``chrom``, ``initial`` and ``final``.

    Return:
    None.
    """
    def revcomp(s):
        # Bases outside A/C/G/T (e.g. IUPAC codes) map to "N" so that such
        # k-mers are skipped by the membership tests below instead of raising
        # KeyError here.
        rev_dict = {"A": "T", "T": "A", "C": "G", "G": "C", "N": "N"}
        return "".join([rev_dict.get(e, "N") for e in s[::-1]])

    k = args.k_nb
    half_k = int(floor(k / 2))

    alphabet = ["A", "C", "G", "T"]
    kmer_comb = ["".join(e) for e in product(alphabet, repeat=k)]
    f_obs_dict = {e: 0.0 for e in kmer_comb}
    r_obs_dict = {e: 0.0 for e in kmer_comb}
    f_exp_dict = {e: 0.0 for e in kmer_comb}
    r_exp_dict = {e: 0.0 for e in kmer_comb}

    # The finally clause releases both handles even if a fetch raises.
    bam_file = Samfile(args.reads_file, "rb")
    fasta_file = None
    try:
        genome_data = GenomeData(args.organism)
        fasta_file = Fastafile(genome_data.get_genome())

        for region in regions:
            # Fetching observed reads
            reads = bam_file.fetch(reference=region.chrom, start=region.initial,
                                   end=region.final)
            for read in reads:
                if not read.is_reverse:
                    p1 = read.pos - half_k + args.forward_shift - 1
                else:
                    p1 = read.aend - half_k + args.reverse_shift + 1
                p2 = p1 + k
                try:
                    dna_sequence_obs = str(fasta_file.fetch(region.chrom, p1, p2)).upper()
                except Exception:
                    continue

                # Membership in the pre-filled dictionaries rejects k-mers
                # with N/ambiguous bases as well as windows truncated at a
                # contig end, which previously raised KeyError.
                if read.is_reverse:
                    dna_sequence_obs = revcomp(dna_sequence_obs)
                    if dna_sequence_obs in r_obs_dict:
                        r_obs_dict[dna_sequence_obs] += 1
                elif dna_sequence_obs in f_obs_dict:
                    f_obs_dict[dna_sequence_obs] += 1

            # Fetching whole sequence
            try:
                dna_sequence_exp = str(fasta_file.fetch(region.chrom, region.initial,
                                                        region.final)).upper()
            except Exception:
                continue
            dna_sequence_exp_rev = revcomp(dna_sequence_exp)

            # NOTE(review): this range stops one short of the last full k-mer
            # (start index len - k); kept unchanged to preserve the counts.
            for i in range(0, len(dna_sequence_exp) - k):
                s = dna_sequence_exp[i:i + k]
                if s in f_exp_dict:
                    f_exp_dict[s] += 1
                s = dna_sequence_exp_rev[i:i + k]
                if s in r_exp_dict:
                    r_exp_dict[s] += 1
    finally:
        bam_file.close()
        if fasta_file is not None:
            fasta_file.close()

    # One output file per (strand, observed/expected) table.
    outputs = (("f_obs", f_obs_dict), ("f_exp", f_exp_dict),
               ("r_obs", r_obs_dict), ("r_exp", r_exp_dict))
    for suffix, counts in outputs:
        output_fname = os.path.join(args.output_location,
                                    "{}_{}.fa".format(str(k), suffix))
        with open(output_fname, "w") as output_file:
            for kmer in kmer_comb:
                if counts[kmer] > 0:
                    output_file.write(kmer + "\t" + str(counts[kmer]) + "\n")
Esempio n. 20
0
    def __init__(self, gene_source, tf_source=None, alias_source=None):
        """
        Builds an AnnotationSet from a gene source and optional TF / alias sources.

        Keyword arguments:
        gene_source -- Where the gene annotation (gene_list) comes from. Accepted forms:
            * A matrix (list of lists): stored directly as the gene list.
            * A string that is a path to an existing file: loaded as a gtf file.
            * Any other string: treated as an organism name; the gtf file is the
                 gencode annotation that GenomeData reports for that organism.

        tf_source -- Where the TF annotation (tf_list) comes from. Accepted forms:
            * A matrix (list of lists): stored directly as the tf list.
            * A list of strings: each entry is either a path to an existing mtf
                 file, or a name resolved to an mtf path through MotifData.
              Anything that is not a list is currently ignored.

        alias_source -- Where the alias dictionary comes from. Accepted forms:
            * A dictionary: stored directly as the alias dictionary.
            * A string that is a path to an existing file: loaded as an alias file.
            * Any other string: treated as an organism name; the alias file is
                 the one GenomeData reports for that organism.
              Anything else is currently ignored.
        """

        # Class Objects
        self.gene_list = []        # Gene annotation.
        self.tf_list = []          # TF PWM annotation.
        self.alias_dict = {}       # Gene Symbol or other IDs -> ENSEMBL ID
        self.symbol_dict = {}      # ENSEMBL ID -> Official gene symbol

        # Required field - gene list
        if isinstance(gene_source, list):
            # Matrix form, used by internal methods.
            self.gene_list = gene_source
        elif isinstance(gene_source, str):
            if os.path.isfile(gene_source):
                gtf_path = gene_source
            else:
                # Not a file: interpret the string as an organism name.
                gtf_path = GenomeData(gene_source).get_gencode_annotation()
            self.load_gene_list(gtf_path, filter_havana=False, protein_coding=True,
                                known_only=True)

        # Optional field - TF list
        if tf_source and isinstance(tf_source, list):
            if isinstance(tf_source[0], list):
                # Matrix form.
                self.tf_list = tf_source
            else:
                motif_data = MotifData()
                # Existing paths are used as they are; other entries are
                # resolved to an mtf path by MotifData.
                mtf_file_list = [
                    entry if os.path.isfile(entry) else motif_data.get_mtf_path(entry)
                    for entry in tf_source
                ]
                self.tf_list = self.load_tf_list(mtf_file_list)
        # TODO Throw error when tf_source is given but is not a list.

        # Optional field - alias dictionary
        if alias_source:
            if isinstance(alias_source, dict):
                # Dictionary form, used by internal methods.
                self.alias_dict = alias_source
            elif isinstance(alias_source, str):
                if os.path.isfile(alias_source):
                    self.load_alias_dict(alias_source)
                else:
                    # Not a file: interpret the string as an organism name.
                    alias_genome = GenomeData(alias_source)
                    self.load_alias_dict(alias_genome.get_gene_alias())
            # TODO Throw error for any other alias_source type.
Esempio n. 21
0
##################################################################################
# Command-line script: converts BED files into FASTA files.
# Options: -bed (a BED file or a directory of BED files), -output (output
# directory) and -organism (name handed to GenomeData to locate the genome).
parser = argparse.ArgumentParser(
    description='Convert BED files into FASTAs',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-bed',
                    type=str,
                    help="BED file or a directory containing BED files")
parser.add_argument('-output', type=str, help="Define the output directory")
parser.add_argument('-organism', type=str, help="Define the organism")
args = parser.parse_args()

# Create the output directory if it does not exist yet.
if not os.path.exists(args.output):
    os.makedirs(args.output)

genome = GenomeData(args.organism)

# Single BED file: convert it directly.
if os.path.isfile(args.bed):
    load_exon_sequence(bed=args.bed,
                       directory=args.output,
                       genome_path=genome.get_genome())

# Directory: walk it recursively and visit every file whose name contains ".bed".
elif os.path.isdir(args.bed):

    for root, dirnames, filenames in os.walk(args.bed):

        for filename in filenames:
            # Substring test: also matches names such as "x.bed.gz".
            if ".bed" in filename:
                print(filename)
                # Keep only the part of the file name before the first ".bed".
                fn = os.path.basename(filename)
                fn = fn.partition(".bed")[0]
                # NOTE(review): the excerpt ends here; the statement that
                # consumes `fn` is not visible in this snippet.
Esempio n. 22
0
def diff_analysis_run(args):
    """Run the differential footprinting analysis over several conditions.

    Steps performed:

    1. create ``<output_location>/Lineplots``;
    2. split the comma-separated ``mpbs_files``, ``reads_files`` and
       ``conditions`` (they must have the same number of entries) and index
       every reads file that has no ``.bai`` next to it;
    3. merge all MPBS files into one sorted, de-duplicated region set;
    4. for every condition and motif name compute a signal profile of length
       ``window_size`` -- ``get_bc_signal`` when ``args.bc`` is set (using the
       default ATAC bias tables), ``get_raw_signal`` otherwise -- serially when
       ``args.nc == 1`` and through a process pool otherwise;
    5. normalise the signals, optionally write the profiles, draw one line
       plot per motif, compute the per-motif statistics and, when exactly two
       conditions are given, the scatter plot.

    Motif length, number of sites and PWM depend only on the merged MPBS set,
    so they are collected once (while processing the first condition) instead
    of being recomputed for every condition.

    Keyword arguments:
    args -- Namespace providing ``output_location``, ``mpbs_files``,
            ``reads_files``, ``conditions``, ``colors``, ``window_size``,
            ``nc``, ``organism``, ``bc``, ``forward_shift``, ``reverse_shift``
            and ``output_profiles``; it is also forwarded to the output helpers.

    Return:
    None.
    """
    # Initializing Error Handler
    err = ErrorHandler()

    output_location = os.path.join(args.output_location, "Lineplots")
    try:
        if not os.path.isdir(output_location):
            os.makedirs(output_location)
    except Exception:
        err.throw_error("MM_OUT_FOLDER_CREATION")

    # check if they have same length
    mpbs_files = args.mpbs_files.strip().split(",")
    reads_files = args.reads_files.strip().split(",")
    conditions = args.conditions.strip().split(",")

    if args.colors is not None:
        colors = args.colors.strip().split(",")
    else:
        colors = [
            "#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00", "#ffff33",
            "#a65628", "#f781bf", "#66c2a5", "#fc8d62", "#8da0cb", "#e78ac3",
            "#a6d854", "#ffd92f", "#e5c494", "#b3b3b3", "#8dd3c7", "#ffffb3",
            "#bebada", "#fb8072", "#80b1d3", "#fdb462", "#b3de69", "#fccde5",
            "#1b9e77", "#d95f02", "#7570b3", "#e7298a", "#66a61e", "#e6ab02",
            "#a6761d", "#666666", "#7fc97f", "#beaed4", "#fdc086", "#ffff99",
            "#386cb0", "#f0027f", "#bf5b17", "#666666"
        ]

    assert len(mpbs_files) == len(reads_files) == len(conditions), \
        "Number of motif, read and condition names are not same: {}, {}, {}".format(len(mpbs_files), len(reads_files),
                                                                                    len(conditions))

    # Check if the index file exists
    for reads_file in reads_files:
        base_name = "{}.bai".format(reads_file)
        if not os.path.exists(base_name):
            pysam.index(reads_file)

    mpbs = GenomicRegionSet("Motif Predicted Binding Sites of All Conditions")
    for mpbs_file in mpbs_files:
        mpbs.read(mpbs_file)

    mpbs.sort()
    mpbs.remove_duplicates()
    mpbs_name_list = list(set(mpbs.get_names()))

    signals = np.zeros(shape=(len(conditions), len(mpbs_name_list),
                              args.window_size),
                       dtype=np.float32)
    # One entry per motif, in the order of mpbs_name_list.
    motif_len = []
    motif_num = []
    motif_pwm = []

    print(" {} cpus are detected and {} of them will be used...\n".format(
        cpu_count(), args.nc))

    genome_data = GenomeData(args.organism)
    fasta = Fastafile(genome_data.get_genome())

    def record_motif_info(mpbs_regions):
        # get motif length, number and pwm matrix
        motif_len.append(mpbs_regions[0].final - mpbs_regions[0].initial)
        motif_num.append(len(mpbs_regions))
        motif_pwm.append(get_pwm(fasta, mpbs_regions, args.window_size))

    print("generating signal for each motif and condition...\n")
    try:
        if args.bc:
            # differential analysis using bias corrected signal
            hmm_data = HmmData()
            table_forward = hmm_data.get_default_bias_table_F_ATAC()
            table_reverse = hmm_data.get_default_bias_table_R_ATAC()
            bias_table = BiasTable().load_table(table_file_name_F=table_forward,
                                                table_file_name_R=table_reverse)
            signal_func = get_bc_signal
            extra_arguments = (bias_table,)
        else:
            # differential analysis using raw signal
            signal_func = get_raw_signal
            extra_arguments = ()

        for i, condition in enumerate(conditions):
            # The motif information is identical for every condition, so it
            # is only gathered during the first pass.
            first_condition = i == 0

            if args.nc == 1:
                # do not use multi-processing
                for j, mpbs_name in enumerate(mpbs_name_list):
                    mpbs_regions = mpbs.by_names([mpbs_name])
                    arguments = (mpbs_regions, reads_files[i], args.organism,
                                 args.window_size, args.forward_shift,
                                 args.reverse_shift) + extra_arguments
                    if args.bc:
                        # A failing motif is logged and its profile left at zero.
                        try:
                            signals[i, j, :] = signal_func(arguments)
                        except Exception:
                            logging.exception("get bias corrected signal failed")
                    else:
                        signals[i, j, :] = signal_func(arguments)

                    if first_condition:
                        record_motif_info(mpbs_regions)
            else:
                # use multi-processing
                print("generating signal for condition {} \n".format(condition))
                with Pool(processes=args.nc) as pool:
                    arguments_list = []
                    for mpbs_name in mpbs_name_list:
                        mpbs_regions = mpbs.by_names([mpbs_name])
                        arguments_list.append(
                            (mpbs_regions, reads_files[i], args.organism,
                             args.window_size, args.forward_shift,
                             args.reverse_shift) + extra_arguments)

                        if first_condition:
                            record_motif_info(mpbs_regions)

                    res = pool.map(signal_func, arguments_list)
                    signals[i] = np.array(res)
    finally:
        # The genome is only needed for the PWMs computed above.
        fasta.close()

    print("signal generation is done!\n")

    # compute normalization factor for each condition
    factors = compute_factors(signals)
    output_factor(args, factors, conditions)

    # normalize signals by factor and number of motifs
    for i in range(len(conditions)):
        for j in range(len(mpbs_name_list)):
            signals[i, j, :] = signals[i, j, :] / (factors[i] * motif_num[j])

    if args.output_profiles:
        output_profiles(mpbs_name_list, signals, conditions,
                        args.output_location)

    print("generating line plot for each motif...\n")
    plot_arguments = [
        (mpbs_name, motif_num[i], signals[:, i, :], conditions,
         motif_pwm[i], output_location, args.window_size, colors)
        for i, mpbs_name in enumerate(mpbs_name_list)
    ]
    if args.nc == 1:
        for arguments in plot_arguments:
            output_line_plot(arguments)
    else:
        with Pool(processes=args.nc) as pool:
            pool.map(output_line_plot, plot_arguments)

    ps_tc_results = [
        get_ps_tc_results(signals[:, i, :], motif_len[i], args.window_size)
        for i in range(len(mpbs_name_list))
    ]

    # find the significant motifs and generate a scatter plot if two conditions are given
    if len(conditions) == 2:
        ps_tc_results = scatter_plot(args, ps_tc_results, mpbs_name_list,
                                     conditions)

    output_stat_results(ps_tc_results, conditions, mpbs_name_list, motif_num,
                        args)
Esempio n. 23
0
def estimate_bias_kmer(args):
    """Estimate forward/reverse k-mer bias tables and hand them to ``write_table``.

    For every region read from ``args.regions_file``:

    * observed counts -- for each read fetched from the BAM file, the k-mer
      window starting ``floor(k_nb / 2)`` bases before the shifted cut site is
      fetched from the genome and counted per strand (reverse-strand k-mers are
      reverse-complemented first);
    * expected counts -- every k-mer of the region sequence and of its reverse
      complement is counted.

    The bias of a k-mer is ``(obs / #reads) / (exp / #k-mers)`` with a
    pseudocount added to ``obs`` and ``exp``, rounded to six decimals.  When a
    strand has no reads (or no expected k-mer could be counted at all) every
    k-mer of that table gets the neutral value ``1``.

    Keyword arguments:
    args -- Namespace providing ``reads_file``, ``organism``, ``regions_file``,
            ``k_nb``, ``forward_shift``, ``reverse_shift``, ``output_location``
            and ``output_prefix``.

    Return:
    None -- the two tables are written through ``write_table``.
    """
    # Parameters
    max_duplicates = 100
    pseudocount = 1.0
    k = args.k_nb
    half_k = int(floor(k / 2))

    # Observed / expected k-mer counters for the forward and reverse strands
    obs_f, obs_r, exp_f, exp_r = {}, {}, {}, {}
    ct_reads_f = 0
    ct_reads_r = 0
    ct_kmers = 0

    # Initializing bam and fasta; the finally clause guarantees both handles
    # are released even if a fetch or the region parsing raises.
    bam_file = Samfile(args.reads_file, "rb")
    fasta_file = None
    try:
        genome_data = GenomeData(args.organism)
        fasta_file = Fastafile(genome_data.get_genome())
        regions = GenomicRegionSet("regions")
        regions.read(args.regions_file)

        # Iterating on HS regions
        for region in regions:
            prev_pos = -1
            duplicate_run = 0

            # Evaluating observed frequencies ################################
            for read in bam_file.fetch(region.chrom, region.initial, region.final):
                # Calculating positions
                if read.is_reverse:
                    cut_site = read.aend + args.reverse_shift + 1
                else:
                    cut_site = read.pos + args.forward_shift - 1
                p1 = cut_site - half_k
                p2 = p1 + k

                # Verifying PCR artifacts: skip a read once more than
                # max_duplicates consecutive reads started at the same p1.
                if p1 == prev_pos:
                    duplicate_run += 1
                else:
                    prev_pos = p1
                    duplicate_run = 0
                if duplicate_run > max_duplicates:
                    continue

                # Fetching k-mer (best effort: unreadable windows are skipped)
                try:
                    kmer = str(fasta_file.fetch(region.chrom, p1, p2)).upper()
                except Exception:
                    continue

                # Counting k-mer in dictionary
                if read.is_reverse:
                    kmer = AuxiliaryFunctions.revcomp(kmer)
                    ct_reads_r += 1
                    obs_r[kmer] = obs_r.get(kmer, 0) + 1
                else:
                    ct_reads_f += 1
                    obs_f[kmer] = obs_f.get(kmer, 0) + 1

            # Evaluating expected frequencies ################################
            try:
                sequence = str(fasta_file.fetch(region.chrom, region.initial,
                                                region.final)).upper()
            except Exception:
                continue
            rev_sequence = AuxiliaryFunctions.revcomp(sequence)

            # NOTE(review): this range stops one short of the last full k-mer
            # (start index len - k); kept unchanged so counts stay comparable
            # with previously generated tables.
            for i in range(0, len(sequence) - k):
                ct_kmers += 1
                s = sequence[i:i + k]
                exp_f[s] = exp_f.get(s, 0) + 1
                s = rev_sequence[i:i + k]
                exp_r[s] = exp_r.get(s, 0) + 1
    finally:
        # Closing files
        bam_file.close()
        if fasta_file is not None:
            fasta_file.close()

    def bias(obs_counts, exp_counts, n_reads, kmer):
        # Neutral bias when there is nothing to normalise by; this also
        # avoids a ZeroDivisionError when no expected k-mer was counted.
        if n_reads == 0 or ct_kmers == 0:
            return 1
        obs = obs_counts.get(kmer, 0) + pseudocount
        exp = exp_counts.get(kmer, 0) + pseudocount
        return round(float(obs / n_reads) / float(exp / ct_kmers), 6)

    # Creating bias dictionary
    alphabet = ["A", "C", "G", "T"]
    kmer_comb = ["".join(e) for e in product(alphabet, repeat=k)]
    bias_table_f = {}
    bias_table_r = {}
    for kmer in kmer_comb:
        bias_table_f[kmer] = bias(obs_f, exp_f, ct_reads_f, kmer)
        bias_table_r[kmer] = bias(obs_r, exp_r, ct_reads_r, kmer)

    write_table(args.output_location, args.output_prefix, [bias_table_f, bias_table_r])
Esempio n. 24
0
def _wig_to_bigwig(wig_fname, chrom_sizes_file, bw_fname):
    """Convert a wig file to bigWig with ``wigToBigWig``; delete the wig on success.

    The command is run without a shell, so paths containing spaces or shell
    metacharacters are passed through verbatim.  If the tool cannot be started
    or exits with a non-zero status, a warning is printed to stderr and the
    wig file is kept so that the computed signal is not lost.

    Return:
    True if the conversion succeeded, False otherwise.
    """
    import subprocess
    import sys

    command = ["wigToBigWig", wig_fname, chrom_sizes_file, bw_fname, "-verbose=0"]
    try:
        return_code = subprocess.call(command)
    except OSError as e:
        print("WARNING: could not run wigToBigWig ({}); keeping {}".format(e, wig_fname),
              file=sys.stderr)
        return False
    if return_code != 0:
        print("WARNING: wigToBigWig exited with status {}; keeping {}".format(return_code,
                                                                              wig_fname),
              file=sys.stderr)
        return False
    os.remove(wig_fname)
    return True


def get_bc_tracks(args):
    """Write bias-corrected signal tracks (wig, optionally bigWig) for a set of regions.

    ``args.input_files`` must hold exactly two entries: the reads (BAM) file
    and the regions file.  The regions are merged, and for each one the
    bias-corrected signal is obtained from
    ``GenomicSignal.get_bc_signal_by_fragment_length`` and appended to the
    output as a ``fixedStep`` wig block (``start`` is ``region.initial + 1``).

    * ``args.strand_specific`` -- write ``<prefix>_forward.wig`` and
      ``<prefix>_reverse.wig`` (reverse values are negated) instead of a
      single ``<prefix>.wig``.
    * ``args.norm`` -- apply ``boyle_norm`` followed by ``hon_norm_atac``
      (98th percentile and standard deviation of the signal) before writing.
    * ``args.bias_table`` -- comma-separated forward,reverse table files; the
      default ATAC tables from ``HmmData`` are used when it is empty.
    * ``args.bigWig`` -- convert every wig file to ``.bw`` with
      ``wigToBigWig`` and remove the wig once the conversion succeeded.

    Output files are opened in append mode, as before.

    Return:
    None.
    """
    # Initializing Error Handler
    err = ErrorHandler()

    if len(args.input_files) != 2:
        err.throw_error("ME_FEW_ARG",
                        add_msg="You must specify reads and regions file.")

    regions = GenomicRegionSet("Interested regions")
    regions.read(args.input_files[1])
    regions.merge()

    reads_file = GenomicSignal()

    def track_path(suffix, extension):
        # e.g. <output_location>/<prefix>_forward.wig
        return os.path.join(args.output_location,
                            "{}{}.{}".format(args.output_prefix, suffix, extension))

    def normalize(signal):
        signal = reads_file.boyle_norm(signal)
        perc = scoreatpercentile(signal, 98)
        std = np.std(signal)
        return reads_file.hon_norm_atac(signal, perc, std)

    def wig_block(region, values):
        return ("fixedStep chrom=" + region.chrom + " start=" +
                str(region.initial + 1) + " step=1\n" +
                "\n".join(values) + "\n")

    # The finally clause releases both handles even if signal generation raises.
    bam = Samfile(args.input_files[0], "rb")
    fasta = None
    try:
        genome_data = GenomeData(args.organism)
        fasta = Fastafile(genome_data.get_genome())

        hmm_data = HmmData()
        if args.bias_table:
            bias_table_list = args.bias_table.split(",")
            bias_table = BiasTable().load_table(
                table_file_name_F=bias_table_list[0],
                table_file_name_R=bias_table_list[1])
        else:
            table_F = hmm_data.get_default_bias_table_F_ATAC()
            table_R = hmm_data.get_default_bias_table_R_ATAC()
            bias_table = BiasTable().load_table(table_file_name_F=table_F,
                                                table_file_name_R=table_R)

        # Arguments shared by every signal request.
        signal_kwargs = dict(bam=bam,
                             fasta=fasta,
                             bias_table=bias_table,
                             forward_shift=args.forward_shift,
                             reverse_shift=args.reverse_shift,
                             min_length=None,
                             max_length=None)

        if args.strand_specific:
            track_suffixes = ["_forward", "_reverse"]
            with open(track_path("_forward", "wig"), "a") as f_forward, \
                    open(track_path("_reverse", "wig"), "a") as f_reverse:
                for region in regions:
                    signal_f, signal_r = reads_file.get_bc_signal_by_fragment_length(
                        ref=region.chrom,
                        start=region.initial,
                        end=region.final,
                        strand=True,
                        **signal_kwargs)

                    if args.norm:
                        signal_f = normalize(signal_f)
                        signal_r = normalize(signal_r)

                    f_forward.write(wig_block(
                        region, [str(e) for e in np.nan_to_num(signal_f)]))
                    # Reverse-strand values are written negated.
                    f_reverse.write(wig_block(
                        region, [str(-e) for e in np.nan_to_num(signal_r)]))
        else:
            track_suffixes = [""]
            with open(track_path("", "wig"), "a") as output_f:
                for region in regions:
                    signal = reads_file.get_bc_signal_by_fragment_length(
                        ref=region.chrom,
                        start=region.initial,
                        end=region.final,
                        strand=False,
                        **signal_kwargs)

                    if args.norm:
                        signal = normalize(signal)

                    output_f.write(wig_block(
                        region, [str(e) for e in np.nan_to_num(signal)]))
    finally:
        bam.close()
        if fasta is not None:
            fasta.close()

    if args.bigWig:
        chrom_sizes_file = genome_data.get_chromosome_sizes()
        for suffix in track_suffixes:
            _wig_to_bigwig(track_path(suffix, "wig"), chrom_sizes_file,
                           track_path(suffix, "bw"))
Esempio n. 25
0
def get_raw_signal(arguments):
    """
    Accumulates raw cut-site counts around the binding sites of one motif for two conditions.

    Keyword arguments:
    arguments -- Tuple unpacked as (mpbs_name, mpbs_file1, mpbs_file2, reads_file1, reads_file2,
                 organism, window_size, forward_shift, reverse_shift):
        * mpbs_name: name used to select regions (by_names) from the combined MPBS sets.
        * mpbs_file1, mpbs_file2: region files read into GenomicRegionSet objects.
        * reads_file1, reads_file2: BAM files opened through Samfile.
        * organism: organism handed to GenomeData to locate the genome FASTA file.
        * window_size: length of the window centred on every binding site.
        * forward_shift, reverse_shift: offsets added to the read start (forward strand)
          or to the read end (reverse strand) to obtain the cut site.

    Return:
    signal_1, signal_2 -- numpy arrays of length window_size holding the summed counts.
    motif_len -- Length (final - initial) of the first selected region, None if there is none.
    pwm -- Dictionary of per-base lists handed to update_pwm for every processed region.
    num_motif -- Number of selected regions, including regions skipped because their window
                 starts at or before position 0.
    """
    (mpbs_name, mpbs_file1, mpbs_file2, reads_file1, reads_file2, organism,
     window_size, forward_shift, reverse_shift) = arguments

    def count_cut_sites(bam, chrom, p1, p2, signal):
        # Adds 1.0 to signal at the cut site of every mapped read whose cut site is in [p1, p2).
        for read in bam.fetch(chrom, p1, p2):
            # check if the read is unmapped, according to issue #112
            if read.is_unmapped:
                continue

            if read.is_reverse:
                cut_site = read.aend + reverse_shift - 1
            else:
                cut_site = read.pos + forward_shift

            if p1 <= cut_site < p2:
                signal[cut_site - p1] += 1.0

    mpbs1 = GenomicRegionSet("Motif Predicted Binding Sites of Condition1")
    mpbs1.read(mpbs_file1)

    mpbs2 = GenomicRegionSet("Motif Predicted Binding Sites of Condition2")
    mpbs2.read(mpbs_file2)

    mpbs = mpbs1.combine(mpbs2, output=True)
    mpbs.sort()

    bam1 = Samfile(reads_file1, "rb")
    bam2 = Samfile(reads_file2, "rb")

    genome_data = GenomeData(organism)
    fasta = Fastafile(genome_data.get_genome())

    signal_1 = np.zeros(window_size)
    signal_2 = np.zeros(window_size)
    motif_len = None
    pwm = {base: [0.0] * window_size for base in "ACGTN"}

    mpbs_regions = mpbs.by_names([mpbs_name])
    num_motif = len(mpbs_regions)

    # Floor division keeps the coordinates integral on Python 3 as well; float positions
    # cannot be used to fetch reads nor to index the signal arrays.
    half_window = window_size // 2

    try:
        for region in mpbs_regions:
            if motif_len is None:
                motif_len = region.final - region.initial

            mid = (region.final + region.initial) // 2
            p1 = mid - half_window
            p2 = mid + half_window

            # Windows that would start at or before position 0 are ignored
            if p1 <= 0:
                continue

            # Fetch raw signal
            count_cut_sites(bam1, region.chrom, p1, p2, signal_1)
            count_cut_sites(bam2, region.chrom, p1, p2, signal_2)

            update_pwm(pwm, fasta, region, p1, p2)
    finally:
        # Release the file handles even if a region fails
        bam1.close()
        bam2.close()
        fasta.close()

    return signal_1, signal_2, motif_len, pwm, num_motif
Esempio n. 26
0
def get_bc_signal(arguments):
    """
    Accumulates bias-corrected signals around the binding sites of one motif for two conditions.

    Keyword arguments:
    arguments -- Tuple unpacked as (mpbs_name, mpbs_file1, mpbs_file2, reads_file1, reads_file2,
                 organism, window_size, forward_shift, reverse_shift, bias_table1, bias_table2):
        * mpbs_name: name used to select regions (by_names) from the combined MPBS sets.
        * mpbs_file1, mpbs_file2: region files read into GenomicRegionSet objects.
        * reads_file1, reads_file2: BAM files opened through Samfile.
        * organism: organism handed to GenomeData to locate the genome FASTA file.
        * window_size: length of the window centred on every binding site.
        * forward_shift, reverse_shift: shifts forwarded to bias_correction.
        * bias_table1, bias_table2: bias tables forwarded to bias_correction for the
          first and second BAM file respectively.

    Return:
    signal_1, signal_2 -- numpy arrays of length window_size holding the summed signals.
    motif_len -- Length (final - initial) of the first selected region, None if there is none.
    pwm -- Dictionary of per-base lists handed to update_pwm for every processed region.
    num_motif -- Number of selected regions, including regions that were skipped.
    """
    (mpbs_name, mpbs_file1, mpbs_file2, reads_file1, reads_file2, organism,
     window_size, forward_shift, reverse_shift, bias_table1,
     bias_table2) = arguments

    mpbs1 = GenomicRegionSet("Motif Predicted Binding Sites of Condition1")
    mpbs1.read(mpbs_file1)

    mpbs2 = GenomicRegionSet("Motif Predicted Binding Sites of Condition2")
    mpbs2.read(mpbs_file2)

    mpbs = mpbs1.combine(mpbs2, output=True)
    mpbs.sort()

    bam1 = Samfile(reads_file1, "rb")
    bam2 = Samfile(reads_file2, "rb")

    genome_data = GenomeData(organism)
    genome_file_name = genome_data.get_genome()
    fasta = Fastafile(genome_file_name)

    signal_1 = np.zeros(window_size)
    signal_2 = np.zeros(window_size)
    motif_len = None
    pwm = {base: [0.0] * window_size for base in "ACGTN"}

    mpbs_regions = mpbs.by_names([mpbs_name])
    num_motif = len(mpbs_regions)

    # Floor division keeps the window coordinates integral on Python 3 as well
    half_window = window_size // 2

    try:
        for region in mpbs_regions:
            if motif_len is None:
                motif_len = region.final - region.initial

            mid = (region.final + region.initial) // 2
            p1 = mid - half_window
            p2 = mid + half_window

            # Windows that would start at or before position 0 are ignored
            if p1 <= 0:
                continue

            # Fetch bias corrected signal of both conditions
            signal1 = bias_correction(chrom=region.chrom,
                                      start=p1,
                                      end=p2,
                                      bam=bam1,
                                      bias_table=bias_table1,
                                      genome_file_name=genome_file_name,
                                      forward_shift=forward_shift,
                                      reverse_shift=reverse_shift)

            signal2 = bias_correction(chrom=region.chrom,
                                      start=p1,
                                      end=p2,
                                      bam=bam2,
                                      bias_table=bias_table2,
                                      genome_file_name=genome_file_name,
                                      forward_shift=forward_shift,
                                      reverse_shift=reverse_shift)

            # Only signals matching the accumulator length can be summed
            if len(signal1) != len(signal_1) or len(signal2) != len(signal_2):
                continue

            signal_1 = np.add(signal_1, np.array(signal1))
            signal_2 = np.add(signal_2, np.array(signal2))

            update_pwm(pwm, fasta, region, p1, p2)
    finally:
        # Release the file handles even if a region fails
        bam1.close()
        bam2.close()
        fasta.close()

    return signal_1, signal_2, motif_len, pwm, num_motif
Esempio n. 27
0
    def __init__(self, gene_source, tf_source=None, alias_source=None):
        """
        Creates an AnnotationSet from a gene source and optional TF / alias sources.

        Keyword arguments:
        gene_source -- Source of the gene annotation (gene_list element). Accepted values:
            * A matrix (list of lists): stored directly as the gene list.
            * A string with the path to an existing gtf file: the gene list is loaded
                 from that file.
            * Any other string: interpreted as an organism; the gene list is loaded from
                 the gencode annotation that GenomeData returns for it.

        tf_source -- Source of the TF annotation (tf_list element). Accepted values:
            * A matrix (list of lists): stored directly as the tf list.
            * A list of strings: every entry that is an existing file is used as an mtf
                 file; every other entry is resolved through MotifData.get_mtf_path.
            Any other value is currently ignored.

        alias_source -- Source of the alias dictionary. Accepted values:
            * A dictionary: stored directly as the alias dictionary.
            * A string with the path to an existing alias (txt) file: the dictionary is
                 loaded from that file.
            * Any other string: interpreted as an organism; the dictionary is loaded from
                 the alias file that GenomeData returns for it.
            Any other value is currently ignored.
        """

        # Class Objects
        self.gene_list = []  # Gene annotation
        self.tf_list = []  # TF PWM annotation
        self.alias_dict = dict()  # Gene Symbol or other IDs -> ENSEMBL ID
        self.symbol_dict = dict()  # ENSEMBL ID -> Official gene symbol

        # Required field - gene list
        if isinstance(gene_source, list):
            # Matrix given by internal methods
            self.gene_list = gene_source
        elif isinstance(gene_source, str):
            if os.path.isfile(gene_source):
                # Path to a gtf file
                gtf_file = gene_source
            else:
                # Organism pointing to a gtf file within data.config
                gtf_file = GenomeData(gene_source).get_gencode_annotation()
            self.load_gene_list(gtf_file, filter_havana=False)

        # Optional field - TF list
        if tf_source and isinstance(tf_source, list):
            if isinstance(tf_source[0], list):
                # Matrix representing the final tf list
                self.tf_list = tf_source
            else:
                motif_data = MotifData()
                # Existing files are taken as they are, anything else is looked up as a repository
                mtf_file_list = [
                    entry if os.path.isfile(entry) else motif_data.get_mtf_path(entry)
                    for entry in tf_source
                ]
                self.tf_list = self.load_tf_list(mtf_file_list)
        # TODO Throw error when tf_source is given but is not a list.

        # Optional field - alias dictionary
        if alias_source:
            if isinstance(alias_source, dict):
                # Dictionary given by internal methods
                self.alias_dict = alias_source
            elif isinstance(alias_source, str):
                if os.path.isfile(alias_source):
                    # Path to a txt alias file
                    alias_file = alias_source
                else:
                    # Organism pointing to a txt alias file within data.config
                    alias_file = GenomeData(alias_source).get_gene_alias()
                self.load_alias_dict(alias_file)
            # TODO Throw error when alias_source is neither a dictionary nor a string.
Esempio n. 28
0
def get_raw_signal(arguments):
    """
    Accumulates raw cut-site counts around the binding sites of one motif for two conditions.

    Keyword arguments:
    arguments -- Tuple unpacked as (mpbs_name, mpbs_file1, mpbs_file2, reads_file1, reads_file2,
                 organism, window_size, forward_shift, reverse_shift):
        * mpbs_name: name used to select regions (by_names) from the combined MPBS sets.
        * mpbs_file1, mpbs_file2: region files read into GenomicRegionSet objects.
        * reads_file1, reads_file2: BAM files opened through Samfile.
        * organism: organism handed to GenomeData to locate the genome FASTA file.
        * window_size: length of the window centred on every binding site.
        * forward_shift, reverse_shift: offsets added to the read start (forward strand)
          or to the read end (reverse strand) to obtain the cut site.

    Return:
    signal_1, signal_2 -- numpy arrays of length window_size holding the summed counts.
    motif_len -- Length (final - initial) of the first selected region, None if there is none.
    pwm -- Dictionary of per-base lists handed to update_pwm for every processed region.
    num_motif -- Number of selected regions, including regions skipped because their window
                 starts at or before position 0.
    """
    (mpbs_name, mpbs_file1, mpbs_file2, reads_file1, reads_file2, organism,
     window_size, forward_shift, reverse_shift) = arguments

    def count_cut_sites(bam, chrom, p1, p2, signal):
        # Adds 1.0 to signal at the cut site of every mapped read whose cut site is in [p1, p2).
        for read in bam.fetch(chrom, p1, p2):
            # Unmapped reads carry no alignment end, so no cut site can be derived from them
            if read.is_unmapped:
                continue

            if read.is_reverse:
                cut_site = read.aend + reverse_shift - 1
            else:
                cut_site = read.pos + forward_shift

            if p1 <= cut_site < p2:
                signal[cut_site - p1] += 1.0

    mpbs1 = GenomicRegionSet("Motif Predicted Binding Sites of Condition1")
    mpbs1.read(mpbs_file1)

    mpbs2 = GenomicRegionSet("Motif Predicted Binding Sites of Condition2")
    mpbs2.read(mpbs_file2)

    mpbs = mpbs1.combine(mpbs2, output=True)
    mpbs.sort()

    bam1 = Samfile(reads_file1, "rb")
    bam2 = Samfile(reads_file2, "rb")

    genome_data = GenomeData(organism)
    fasta = Fastafile(genome_data.get_genome())

    signal_1 = np.zeros(window_size)
    signal_2 = np.zeros(window_size)
    motif_len = None
    pwm = {base: [0.0] * window_size for base in "ACGTN"}

    mpbs_regions = mpbs.by_names([mpbs_name])
    num_motif = len(mpbs_regions)

    # Floor division keeps the coordinates integral on Python 3 as well; float positions
    # cannot be used to fetch reads nor to index the signal arrays.
    half_window = window_size // 2

    try:
        for region in mpbs_regions:
            if motif_len is None:
                motif_len = region.final - region.initial

            mid = (region.final + region.initial) // 2
            p1 = mid - half_window
            p2 = mid + half_window

            # Windows that would start at or before position 0 are ignored
            if p1 <= 0:
                continue

            # Fetch raw signal
            count_cut_sites(bam1, region.chrom, p1, p2, signal_1)
            count_cut_sites(bam2, region.chrom, p1, p2, signal_2)

            update_pwm(pwm, fasta, region, p1, p2)
    finally:
        # Release the file handles even if a region fails
        bam1.close()
        bam2.close()
        fasta.close()

    return signal_1, signal_2, motif_len, pwm, num_motif
Esempio n. 29
0
def get_bc_tracks(args):
    """
    Writes bias-corrected signal tracks (wig, optionally converted to bigWig) for a set of regions.

    Keyword arguments:
    args -- Parsed command line arguments. The following attributes are read:
        * input_files: list with exactly two entries, the reads (BAM) file and the regions file.
        * organism: organism handed to GenomeData (genome FASTA and chromosome sizes).
        * bias_table: optional string "forward_table,reverse_table"; when empty the default
          ATAC bias tables provided by HmmData are loaded.
        * forward_shift, reverse_shift: shifts forwarded to the signal computation.
        * strand_specific: if set, one track per strand is written
          (<prefix>_forward.wig and <prefix>_reverse.wig, reverse values negated),
          otherwise a single <prefix>.wig track.
        * norm: if set, every signal is normalized (boyle_norm followed by hon_norm_atac).
        * bigWig: if set, every wig file is converted with the external wigToBigWig
          program and removed afterwards.
        * output_location, output_prefix: directory and file prefix of the output tracks.

    Return:
    None -- The tracks are written to disk. Wig files are opened in append mode, so
            pre-existing files with the same name are extended, not replaced.
    """
    # Initializing Error Handler
    err = ErrorHandler()

    if len(args.input_files) != 2:
        err.throw_error("ME_FEW_ARG", add_msg="You must specify reads and regions file.")

    regions = GenomicRegionSet("Interested regions")
    regions.read(args.input_files[1])
    regions.merge()

    reads_file = GenomicSignal()

    bam = Samfile(args.input_files[0], "rb")
    genome_data = GenomeData(args.organism)
    fasta = Fastafile(genome_data.get_genome())

    # Bias tables: user provided ("forward,reverse") or the default ATAC tables
    hmm_data = HmmData()
    if args.bias_table:
        bias_table_list = args.bias_table.split(",")
        table_F, table_R = bias_table_list[0], bias_table_list[1]
    else:
        table_F = hmm_data.get_default_bias_table_F_ATAC()
        table_R = hmm_data.get_default_bias_table_R_ATAC()
    bias_table = BiasTable().load_table(table_file_name_F=table_F,
                                        table_file_name_R=table_R)

    def corrected_signal(region, strand):
        # Bias-corrected signal of one region; with strand=True a (forward, reverse) pair.
        return reads_file.get_bc_signal_by_fragment_length(
            ref=region.chrom, start=region.initial, end=region.final, bam=bam, fasta=fasta,
            bias_table=bias_table, forward_shift=args.forward_shift,
            reverse_shift=args.reverse_shift, min_length=None, max_length=None, strand=strand)

    def normalize(signal):
        # Boyle normalization followed by Hon normalization (98th percentile and std).
        signal = reads_file.boyle_norm(signal)
        perc = scoreatpercentile(signal, 98)
        std = np.std(signal)
        return reads_file.hon_norm_atac(signal, perc, std)

    def wig_block(region, values):
        # fixedStep wig record of one region; wig coordinates are 1-based.
        return ("fixedStep chrom=" + region.chrom + " start=" + str(region.initial + 1) +
                " step=1\n" + "\n".join([str(e) for e in values]) + "\n")

    def convert_to_bigwig(wig_fname, bw_basename):
        # Converts a wig file with the external wigToBigWig tool and removes the wig file.
        chrom_sizes_file = genome_data.get_chromosome_sizes()
        bw_filename = os.path.join(args.output_location, bw_basename)
        os.system(" ".join(["wigToBigWig", wig_fname, chrom_sizes_file, bw_filename, "-verbose=0"]))
        os.remove(wig_fname)

    try:
        if args.strand_specific:
            fname_forward = os.path.join(args.output_location, "{}_forward.wig".format(args.output_prefix))
            fname_reverse = os.path.join(args.output_location, "{}_reverse.wig".format(args.output_prefix))

            # Context managers guarantee both files are closed even if a region fails
            with open(fname_forward, "a") as f_forward, open(fname_reverse, "a") as f_reverse:
                for region in regions:
                    signal_f, signal_r = corrected_signal(region, strand=True)

                    if args.norm:
                        signal_f = normalize(signal_f)
                        signal_r = normalize(signal_r)

                    f_forward.write(wig_block(region, np.nan_to_num(signal_f)))
                    # The reverse strand is written with negated values
                    f_reverse.write(wig_block(region, [-e for e in np.nan_to_num(signal_r)]))

            if args.bigWig:
                convert_to_bigwig(fname_forward, "{}_forward.bw".format(args.output_prefix))
                convert_to_bigwig(fname_reverse, "{}_reverse.bw".format(args.output_prefix))

        else:
            output_fname = os.path.join(args.output_location, "{}.wig".format(args.output_prefix))
            with open(output_fname, "a") as output_f:
                for region in regions:
                    signal = corrected_signal(region, strand=False)

                    if args.norm:
                        signal = normalize(signal)

                    output_f.write(wig_block(region, np.nan_to_num(signal)))

            if args.bigWig:
                convert_to_bigwig(output_fname, "{}.bw".format(args.output_prefix))
    finally:
        # Release the input handles even if the track generation fails
        bam.close()
        fasta.close()
Esempio n. 30
0
def estimate_bias_pwm(args):
    """
    Estimates forward and reverse bias tables from observed and expected position frequency matrices.

    Observed matrices count the bases of the k-mer around the cut site of every read that
    falls in the given regions; expected matrices count the bases of the k-mers of the
    region sequences (and of their reverse complements). The four matrices are written as
    pfm files and sequence logos, and the bias of every k-mer is the ratio between its
    observed and expected scores (get_ppm_score), rounded to 6 decimals.

    Keyword arguments:
    args -- Parsed command line arguments. The following attributes are read:
        * reads_file: BAM file opened through Samfile.
        * organism: organism handed to GenomeData to locate the genome FASTA file.
        * regions_file: file with the regions to iterate on.
        * k_nb: k-mer length.
        * forward_shift, reverse_shift: shifts applied to the read start (forward strand)
          or read end (reverse strand) to obtain the cut site.
        * output_location, output_prefix: output directory and prefix handed to write_table;
          the "pfm" and "logo" sub-directories are created inside output_location.

    Return:
    None -- The pfm files, logos and bias tables are written to disk.
    """
    # Parameters
    max_duplicates = 100
    k_nb = args.k_nb
    half_k = k_nb // 2
    alphabet = ["A", "C", "G", "T"]

    def empty_pfm():
        # One zeroed count list of length k_nb per base ("N" is counted but never written).
        return {base: [0.0] * k_nb for base in "ACGTN"}

    def count_bases(pfm, sequence):
        # Adds one count for every base of the sequence at its position.
        for pos, base in enumerate(sequence):
            pfm[base][pos] += 1

    def make_dir(path):
        # Equivalent of "mkdir -p" without building a shell command from the path.
        if not os.path.isdir(path):
            os.makedirs(path)

    def read_pfm(path):
        # Reads a pfm file and closes the handle once the motif is parsed.
        with open(path) as pfm_handle:
            return motifs.read(pfm_handle, "pfm")

    # Initializing bam and fasta
    bamFile = Samfile(args.reads_file, "rb")
    genome_data = GenomeData(args.organism)
    fastaFile = Fastafile(genome_data.get_genome())
    regions = GenomicRegionSet("regions")
    regions.read(args.regions_file)

    obs_f_pwm_dict = empty_pfm()
    exp_f_pwm_dict = empty_pfm()
    obs_r_pwm_dict = empty_pfm()
    exp_r_pwm_dict = empty_pfm()

    try:
        # Iterating on HS regions
        for region in regions:
            # Initialization
            prev_pos = -1
            true_counter = 0

            # Evaluating observed frequencies
            # Fetching reads
            for r in bamFile.fetch(region.chrom, region.initial, region.final):
                # Unmapped reads have no alignment end, so no cut site can be derived
                if r.is_unmapped:
                    continue

                # Calculating positions
                if not r.is_reverse:
                    cut_site = r.pos + args.forward_shift - 1
                else:
                    cut_site = r.aend + args.reverse_shift + 1
                p1 = cut_site - half_k
                p2 = p1 + k_nb

                # Verifying PCR artifacts: reads are ignored once more than max_duplicates
                # consecutive reads share the same k-mer start
                if p1 == prev_pos:
                    true_counter += 1
                else:
                    prev_pos = p1
                    true_counter = 0
                if true_counter > max_duplicates:
                    continue

                # Fetching k-mer (positions outside the reference are skipped)
                try:
                    currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
                except Exception:
                    continue

                # Counting k-mer in dictionary
                if r.is_reverse:
                    count_bases(obs_r_pwm_dict, AuxiliaryFunctions.revcomp(currStr))
                else:
                    count_bases(obs_f_pwm_dict, currStr)

            # Evaluating expected frequencies
            # Fetching whole sequence
            try:
                currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
            except Exception:
                continue

            # Iterating on each sequence position
            # NOTE(review): the range stops one k-mer before the end of the sequence;
            # kept as it is so the expected counts do not change.
            for start in range(0, len(currStr) - k_nb):
                # Counting k-mer in dictionary
                s = currStr[start:start + k_nb]
                count_bases(exp_f_pwm_dict, s)

                # Counting k-mer in dictionary for reverse complement
                count_bases(exp_r_pwm_dict, AuxiliaryFunctions.revcomp(s))
    finally:
        # Closing files
        bamFile.close()
        fastaFile.close()

    # Output pwms
    names = [label.format(k_nb) for label in ["obs_{}_f", "obs_{}_r", "exp_{}_f", "exp_{}_r"]]
    pwm_dict_list = [obs_f_pwm_dict, obs_r_pwm_dict, exp_f_pwm_dict, exp_r_pwm_dict]

    pfm_dir = os.path.join(args.output_location, "pfm")
    make_dir(pfm_dir)
    pwm_file_list = [os.path.join(pfm_dir, name + ".pfm") for name in names]

    for pwm_dict, pwm_fname in zip(pwm_dict_list, pwm_file_list):
        with open(pwm_fname, "w") as pwm_file:
            for e in alphabet:
                pwm_file.write(" ".join([str(int(f)) for f in pwm_dict[e]]) + "\n")

    motif_obs_f, motif_obs_r, motif_exp_f, motif_exp_r = [read_pfm(f) for f in pwm_file_list]

    # Output logos
    logo_dir = os.path.join(args.output_location, "logo")
    make_dir(logo_dir)
    logo_file_list = [os.path.join(logo_dir, name + ".pdf") for name in names]

    # Observed logos use a y axis ten times larger than the expected ones
    logo_settings = [(motif_obs_f, 0.2, 0.1), (motif_obs_r, 0.2, 0.1),
                     (motif_exp_f, 0.02, 0.01), (motif_exp_r, 0.02, 0.01)]
    for (motif, yaxis_scale, yaxis_tic), logo_fname in zip(logo_settings, logo_file_list):
        motif.weblogo(logo_fname, format="pdf", stack_width="large", color_scheme="color_classic",
                      yaxis_scale=yaxis_scale, yaxis_tic_interval=yaxis_tic)

    # Creating bias dictionary
    k_mer_comb = ["".join(e) for e in product(alphabet, repeat=k_nb)]
    bias_table_F = dict([(e, 0.0) for e in k_mer_comb])
    bias_table_R = dict([(e, 0.0) for e in k_mer_comb])
    for k_mer in k_mer_comb:
        obs_f = get_ppm_score(k_mer, motif_obs_f.pwm, k_nb)
        exp_f = get_ppm_score(k_mer, motif_exp_f.pwm, k_nb)
        bias_table_F[k_mer] = round(obs_f / exp_f, 6)
        obs_r = get_ppm_score(k_mer, motif_obs_r.pwm, k_nb)
        exp_r = get_ppm_score(k_mer, motif_exp_r.pwm, k_nb)
        bias_table_R[k_mer] = round(obs_r / exp_r, 6)

    write_table(args.output_location, args.output_prefix, [bias_table_F, bias_table_R])
Esempio n. 31
0
def estimate_bias_pwm(args):
    """
    Estimates forward and reverse bias tables from observed and expected position frequency matrices.

    Observed matrices count the bases of the k-mer around the cut site of every read that
    falls in the given regions; expected matrices count the bases of the k-mers of the
    region sequences (and of their reverse complements). The four matrices are written as
    pfm files and sequence logos, and the bias of every k-mer is the ratio between its
    observed and expected scores (get_ppm_score), rounded to 6 decimals.

    Keyword arguments:
    args -- Parsed command line arguments. The following attributes are read:
        * reads_file: BAM file opened through Samfile.
        * organism: organism handed to GenomeData to locate the genome FASTA file.
        * regions_file: file with the regions to iterate on.
        * k_nb: k-mer length.
        * forward_shift, reverse_shift: shifts applied to the read start (forward strand)
          or read end (reverse strand) to obtain the cut site.
        * output_location, output_prefix: output directory and prefix handed to write_table;
          the "pfm" and "logo" sub-directories are created inside output_location.

    Return:
    None -- The pfm files, logos and bias tables are written to disk.
    """
    # Parameters
    max_duplicates = 100
    k_nb = args.k_nb
    half_k = k_nb // 2
    alphabet = ["A", "C", "G", "T"]

    def empty_pfm():
        # One zeroed count list of length k_nb per base ("N" is counted but never written).
        return {base: [0.0] * k_nb for base in "ACGTN"}

    def count_bases(pfm, sequence):
        # Adds one count for every base of the sequence at its position.
        for pos, base in enumerate(sequence):
            pfm[base][pos] += 1

    def make_dir(path):
        # Equivalent of "mkdir -p" without building a shell command from the path.
        if not os.path.isdir(path):
            os.makedirs(path)

    def read_pfm(path):
        # Reads a pfm file and closes the handle once the motif is parsed.
        with open(path) as pfm_handle:
            return motifs.read(pfm_handle, "pfm")

    # Initializing bam and fasta
    bamFile = Samfile(args.reads_file, "rb")
    genome_data = GenomeData(args.organism)
    fastaFile = Fastafile(genome_data.get_genome())
    regions = GenomicRegionSet("regions")
    regions.read(args.regions_file)

    obs_f_pwm_dict = empty_pfm()
    exp_f_pwm_dict = empty_pfm()
    obs_r_pwm_dict = empty_pfm()
    exp_r_pwm_dict = empty_pfm()

    try:
        # Iterating on HS regions
        for region in regions:
            # Initialization
            prev_pos = -1
            true_counter = 0

            # Evaluating observed frequencies
            # Fetching reads
            for r in bamFile.fetch(region.chrom, region.initial, region.final):
                # Unmapped reads have no alignment end, so no cut site can be derived
                if r.is_unmapped:
                    continue

                # Calculating positions
                if not r.is_reverse:
                    cut_site = r.pos + args.forward_shift - 1
                else:
                    cut_site = r.aend + args.reverse_shift + 1
                p1 = cut_site - half_k
                p2 = p1 + k_nb

                # Verifying PCR artifacts: reads are ignored once more than max_duplicates
                # consecutive reads share the same k-mer start
                if p1 == prev_pos:
                    true_counter += 1
                else:
                    prev_pos = p1
                    true_counter = 0
                if true_counter > max_duplicates:
                    continue

                # Fetching k-mer (positions outside the reference are skipped)
                try:
                    currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
                except Exception:
                    continue

                # Counting k-mer in dictionary
                if r.is_reverse:
                    count_bases(obs_r_pwm_dict, AuxiliaryFunctions.revcomp(currStr))
                else:
                    count_bases(obs_f_pwm_dict, currStr)

            # Evaluating expected frequencies
            # Fetching whole sequence
            try:
                currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
            except Exception:
                continue

            # Iterating on each sequence position
            # NOTE(review): the range stops one k-mer before the end of the sequence;
            # kept as it is so the expected counts do not change.
            for start in range(0, len(currStr) - k_nb):
                # Counting k-mer in dictionary
                s = currStr[start:start + k_nb]
                count_bases(exp_f_pwm_dict, s)

                # Counting k-mer in dictionary for reverse complement
                count_bases(exp_r_pwm_dict, AuxiliaryFunctions.revcomp(s))
    finally:
        # Closing files
        bamFile.close()
        fastaFile.close()

    # Output pwms
    names = [label.format(k_nb) for label in ["obs_{}_f", "obs_{}_r", "exp_{}_f", "exp_{}_r"]]
    pwm_dict_list = [obs_f_pwm_dict, obs_r_pwm_dict, exp_f_pwm_dict, exp_r_pwm_dict]

    pfm_dir = os.path.join(args.output_location, "pfm")
    make_dir(pfm_dir)
    pwm_file_list = [os.path.join(pfm_dir, name + ".pfm") for name in names]

    for pwm_dict, pwm_fname in zip(pwm_dict_list, pwm_file_list):
        with open(pwm_fname, "w") as pwm_file:
            for e in alphabet:
                pwm_file.write(" ".join([str(int(f)) for f in pwm_dict[e]]) + "\n")

    motif_obs_f, motif_obs_r, motif_exp_f, motif_exp_r = [read_pfm(f) for f in pwm_file_list]

    # Output logos
    logo_dir = os.path.join(args.output_location, "logo")
    make_dir(logo_dir)
    logo_file_list = [os.path.join(logo_dir, name + ".pdf") for name in names]

    # Observed logos use a y axis ten times larger than the expected ones
    logo_settings = [(motif_obs_f, 0.2, 0.1), (motif_obs_r, 0.2, 0.1),
                     (motif_exp_f, 0.02, 0.01), (motif_exp_r, 0.02, 0.01)]
    for (motif, yaxis_scale, yaxis_tic), logo_fname in zip(logo_settings, logo_file_list):
        motif.weblogo(logo_fname, format="pdf", stack_width="large", color_scheme="color_classic",
                      yaxis_scale=yaxis_scale, yaxis_tic_interval=yaxis_tic)

    # Creating bias dictionary
    k_mer_comb = ["".join(e) for e in product(alphabet, repeat=k_nb)]
    bias_table_F = dict([(e, 0.0) for e in k_mer_comb])
    bias_table_R = dict([(e, 0.0) for e in k_mer_comb])
    for k_mer in k_mer_comb:
        obs_f = get_ppm_score(k_mer, motif_obs_f.pwm, k_nb)
        exp_f = get_ppm_score(k_mer, motif_exp_f.pwm, k_nb)
        bias_table_F[k_mer] = round(obs_f / exp_f, 6)
        obs_r = get_ppm_score(k_mer, motif_obs_r.pwm, k_nb)
        exp_r = get_ppm_score(k_mer, motif_exp_r.pwm, k_nb)
        bias_table_R[k_mer] = round(obs_r / exp_r, 6)

    write_table(args.output_location, args.output_prefix, [bias_table_F, bias_table_R])
Esempio n. 32
0
def create_signal(args, regions):
    """Count observed and expected k-mers over ``regions`` and write them to disk.

    For every region, each read fetched from the BAM file contributes one
    "observed" k-mer: the genome window of length ``args.k_nb`` placed around
    the shifted read start (forward strand) or read end (reverse strand,
    stored reverse-complemented).  The "expected" counts are taken from the
    k-mer windows of the region's own sequence, on both strands.

    Four tab-separated files (``<k-mer>\\t<count>``, only k-mers with a
    non-zero count) are written into ``args.output_location``:
    ``{k}_f_obs.fa``, ``{k}_f_exp.fa``, ``{k}_r_obs.fa`` and ``{k}_r_exp.fa``.

    Args:
        args: object providing ``k_nb``, ``reads_file``, ``organism``,
            ``forward_shift``, ``reverse_shift`` and ``output_location``.
        regions: iterable of objects exposing ``chrom``, ``initial`` and
            ``final``.

    Returns:
        None.
    """
    # Local import: only this function needs it.
    from contextlib import closing

    def revcomp(s):
        rev_dict = dict([("A", "T"), ("T", "A"), ("C", "G"), ("G", "C"), ("N", "N")])
        # Symbols outside ACGTN (e.g. ambiguity codes) become "N" instead of
        # raising KeyError; windows containing them are filtered out below.
        return "".join([rev_dict.get(e, "N") for e in s[::-1]])

    alphabet = ["A", "C", "G", "T"]
    kmer_comb = ["".join(e) for e in product(alphabet, repeat=args.k_nb)]
    # Counts are kept as floats so the written values keep their "1.0" form.
    f_obs_dict = dict([(e, 0.0) for e in kmer_comb])
    r_obs_dict = dict([(e, 0.0) for e in kmer_comb])
    f_exp_dict = dict([(e, 0.0) for e in kmer_comb])
    r_exp_dict = dict([(e, 0.0) for e in kmer_comb])

    half_k = args.k_nb // 2
    genome_data = GenomeData(args.organism)

    # closing() guarantees both handles are released even if counting fails.
    with closing(Samfile(args.reads_file, "rb")) as bam_file, \
            closing(Fastafile(genome_data.get_genome())) as fasta_file:
        for region in regions:
            # Fetching observed reads
            reads = bam_file.fetch(reference=region.chrom, start=region.initial, end=region.final)
            for read in reads:
                if not read.is_reverse:
                    p1 = read.pos - half_k + args.forward_shift - 1
                else:
                    p1 = read.aend - half_k + args.reverse_shift + 1
                p2 = p1 + args.k_nb
                try:
                    dna_sequence_obs = str(fasta_file.fetch(region.chrom, p1, p2)).upper()
                except Exception:
                    # Best effort: a window that cannot be fetched is ignored.
                    continue
                # The dict keys are exactly the ACGT words of length k, so this
                # rejects windows containing "N", other symbols, or fewer than
                # k bases (which would otherwise raise KeyError).
                if dna_sequence_obs not in f_obs_dict:
                    continue
                if read.is_reverse:
                    r_obs_dict[revcomp(dna_sequence_obs)] += 1
                else:
                    f_obs_dict[dna_sequence_obs] += 1

            # Fetching whole sequence
            try:
                dna_sequence_exp = str(fasta_file.fetch(region.chrom, region.initial, region.final)).upper()
            except Exception:
                continue
            dna_sequence_exp_rev = revcomp(dna_sequence_exp)
            # NOTE(review): this bound leaves out the last full window of the
            # region (start index len - k); kept as-is so the resulting counts
            # do not change -- confirm whether that is intended.
            for i in range(0, len(dna_sequence_exp) - args.k_nb):
                s = dna_sequence_exp[i:i + args.k_nb]
                if s in f_exp_dict:
                    f_exp_dict[s] += 1
                s = dna_sequence_exp_rev[i:i + args.k_nb]
                if s in r_exp_dict:
                    r_exp_dict[s] += 1

    outputs = [
        ("{}_f_obs.fa", f_obs_dict),
        ("{}_f_exp.fa", f_exp_dict),
        ("{}_r_obs.fa", r_obs_dict),
        ("{}_r_exp.fa", r_exp_dict),
    ]
    for name_template, counts in outputs:
        output_fname = os.path.join(args.output_location, name_template.format(str(args.k_nb)))
        # "with" closes the file even when a write fails.
        with open(output_fname, "w") as output_file:
            for kmer in kmer_comb:
                if counts[kmer] > 0:
                    output_file.write(kmer + "\t" + str(counts[kmer]) + "\n")
Esempio n. 33
0
# annot = AnnotationSet(genome,filter_havana=True,protein_coding=True,known_only=True)
# print("\tloading AnnotationSet... succeeds")
# promoters = annot.get_promoters()
# print("\tPromoters "+str(len(promoters)))
# gd = GenomeData(organism=genome)
# print("\t"+gd.get_annotation())
# print("\tloading GenomeData... succeeds")

# Smoke check: load the annotation set and the genome data for one assembly
# and report each step on stdout.
genome = "hg38"
print("Checking " + genome)
annot = AnnotationSet(
    genome,
    filter_havana=False,
    protein_coding=True,
    known_only=False,
)
# annot = AnnotationSet(genome,filter_havana=True,protein_coding=True,known_only=True)
print("\tloading AnnotationSet... succeeds")
promoters = annot.get_promoters()
promoter_total = len(promoters)
print("\tPromoters " + str(promoter_total))
gd = GenomeData(organism=genome)
annotation = gd.get_annotation()
print("\t" + annotation)
print("\tloading GenomeData... succeeds")

# genome = "mm9"
# print("Checking " + genome)
# annot = AnnotationSet(genome,filter_havana=True,protein_coding=True,known_only=True)
# print("\tloading AnnotationSet... succeeds")
# promoters = annot.get_promoters()
# print("\tPromoters "+str(len(promoters)))
# gd = GenomeData(organism=genome)
# print("\t"+gd.get_annotation())
# print("\tloading GenomeData... succeeds")

# genome = "zv9"
# print("Checking " + genome)
Esempio n. 34
0
        



##################################################################################
# Command-line interface: three string options, all registered the same way.
parser = argparse.ArgumentParser(
    description='Convert BED files into FASTAs',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
for _flag, _help_text in (
    ('-bed', "BED file or a directory containing BED files"),
    ('-output', "Define the output directory"),
    ('-organism', "Define the organism"),
):
    parser.add_argument(_flag, type=str, help=_help_text)
args = parser.parse_args()

# Create the output directory when it is not there yet.
output_dir_missing = not os.path.exists(args.output)
if output_dir_missing:
    os.makedirs(args.output)

genome = GenomeData(args.organism)

if os.path.isfile(args.bed):
    load_exon_sequence(bed=args.bed, directory=args.output, genome_path=genome.get_genome())

elif os.path.isdir(args.bed):

    for root, dirnames, filenames in os.walk(args.bed):
            
        for filename in filenames:
            if ".bed" in filename:
                print(filename)
                fn = os.path.basename(filename)
                fn = fn.partition(".bed")[0]
                try:
                    load_exon_sequence(bed=os.path.join(args.bed,filename), 
Esempio n. 35
0
from rgt.Util import GenomeData
import argparse 
import os        

##################################################################################
# Command-line interface: input BED (file or directory), output location and
# organism, all plain string options.
_cli_options = [
    ('-bed', "BED file or a directory containing BED files"),
    ('-output', "Define the output directory"),
    ('-organism', "Define the organism"),
]
parser = argparse.ArgumentParser(
    description='Replace TCONs in BED file by assoicated gene names',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
for _flag, _help_text in _cli_options:
    parser.add_argument(_flag, type=str, help=_help_text)
args = parser.parse_args()

genome = GenomeData(args.organism)

if os.path.isfile(args.bed):
    regionset = GenomicRegionSet("bed")
    regionset.read_bed(args.bed)
    gr = regionset.gene_association(organism=args.organism, promoterLength=1000, 
                                    threshDist=500000, show_dis=True)
    regionset.replace_region_name(gr,combine=True)
    
    regionset.write_bed(args.output)

elif os.path.isdir(args.bed):
    if not os.path.exists(args.output):
        os.makedirs(args.output)
    for root, dirnames, filenames in os.walk(args.bed):
            
Esempio n. 36
0
 def setUp(self):
     """Prepare an hg19 GenomeData whose genome is a FASTA file next to this module."""
     # the genome must be available
     # TODO: we could make this test pure by manually using the sequence corresponding to the input region
     fixture_dir = os.path.dirname(__file__)
     genome_data = GenomeData("hg19")
     # Override the genome attribute so get_genome() yields the local FASTA.
     genome_data.genome = os.path.join(fixture_dir, "hg19_chr1_710000_715000.fa")
     self.genome_data = genome_data
     self.genome_file = Fastafile(genome_data.get_genome())
Esempio n. 37
0
 def setUp(self):
     """Create the hg19 GenomeData and open its genome with Fastafile."""
     # the genome must be available
     # TODO: we could make this test pure by manually using the sequence corresponding to the input region
     genome_data = GenomeData("hg19")
     genome_path = genome_data.get_genome()
     self.genome_data = genome_data
     self.genome_file = Fastafile(genome_path)
Esempio n. 38
0
import sys
import pandas as pd
from pysam import Samfile
from rgt.GenomicRegionSet import GenomicRegionSet
from rgt.Util import GenomeData

# Positional command-line arguments: the regions file read into gr_tfs, the
# BAM file opened below, and an output file name.
tf_file = sys.argv[1]
bam_file = sys.argv[2]
output_file = sys.argv[3]

gr_tfs = GenomicRegionSet(name="TFs")
gr_tfs.read(filename=tf_file)
gr_genes = gr_tfs.gene_association(organism="hg38")

# Fetching chromosome sizes
genome_data = GenomeData("hg38")
chrom_sizes_file_name = genome_data.get_chromosome_sizes()
chrom_sizes_dict = dict()
# "with" closes the file even if a line fails to parse.
with open(chrom_sizes_file_name, "r") as chrom_sizes_file:
    for chrom_sizes_entry_line in chrom_sizes_file:
        chrom_sizes_entry_vec = chrom_sizes_entry_line.strip().split("\t")
        # Skip blank or short lines instead of failing with IndexError.
        if len(chrom_sizes_entry_vec) < 2:
            continue
        chrom_sizes_dict[chrom_sizes_entry_vec[0]] = int(chrom_sizes_entry_vec[1])

bam = Samfile(bam_file, "rb")

# Accumulators filled by the loop over gr_tfs that follows.
tf_list = list()
gene_list = list()
tc_list = list()

for i, r in enumerate(gr_tfs):