コード例 #1
0
ファイル: MotifSet.py プロジェクト: CostaLab/reg-gen
    def get_motif_list(self, pseudocounts=1.0, fpr=0.0001):
        """
        Creates a Motif for every motif of this set that has a PWM file on disk.

        Each directory in self.motif_data.pwm_list is searched for a file named
        "<motif name>.pwm" for every entry of self.motifs_map.

        Keyword arguments:
        pseudocounts -- Pseudocounts passed to tools.log_odds and to Motif (default 1.0).
        fpr -- False positive rate used to select the matching threshold (default 0.0001).

        Return:
        motif_list -- List of Motif objects, one for each PWM file found.
        """

        motif_list = []

        # The threshold is derived from the PSSM, which itself depends on the pseudocounts.
        # Thresholds kept in the motif annotations are therefore only read (and written)
        # when the default pseudocounts are requested; otherwise a value computed for
        # different pseudocounts would be reused, or would overwrite the stored one.
        # NOTE(review): assumes the stored thresholds were computed with pseudocounts == 1.0 -- confirm.
        default_pseudocounts = pseudocounts == 1.0

        # iterate over all available PWM directories
        for motif_dir_path in self.motif_data.pwm_list:

            # iterate over all motif elements in this set
            for motif_name, ma in self.motifs_map.items():
                motif_file_name = os.path.join(motif_dir_path, motif_name + ".pwm")

                # motifs without a PWM file in this directory are not returned
                if not os.path.isfile(motif_file_name):
                    continue

                # check whether ma provides the motif matching threshold for the given fpr
                threshold = None
                if default_pseudocounts and fpr in ma.thresholds:
                    threshold = ma.thresholds[fpr]

                # recalculate it otherwise (a missing or falsy stored value counts as absent)
                if not threshold:
                    pfm = parsers.pfm(str(motif_file_name))
                    bg = tools.flat_bg(len(pfm))  # total number of "points" to add, not per-row
                    pssm = tools.log_odds(pfm, bg, pseudocounts, 2)
                    threshold = tools.threshold_from_p(pssm, bg, fpr)

                    # only thresholds valid for the default pseudocounts are stored
                    if default_pseudocounts:
                        ma.thresholds[fpr] = threshold

                motif_list.append(Motif(motif_file_name, pseudocounts, threshold))

        return motif_list
コード例 #2
0
    def get_motif_list(self, pseudocounts=1.0, fpr=0.0001):
        """
        Returns the Motif objects of this set for which a PWM file exists.

        For every directory of self.motif_data.pwm_list and every entry of
        self.motifs_map, the file "<directory>/<motif name>.pwm" is looked up;
        a Motif is created for each file that exists.

        Keyword arguments:
        pseudocounts -- Pseudocounts passed to tools.log_odds and to Motif (default 1.0).
        fpr -- False positive rate used to select the matching threshold (default 0.0001).

        Return:
        motif_list -- List of Motif objects.
        """

        motif_list = []

        # Stored thresholds are bypassed for non-default pseudocounts: the PSSM (and so
        # the threshold for a given fpr) changes with the pseudocounts, hence a stored
        # value must neither be reused nor replaced in that case.
        # NOTE(review): assumes the stored thresholds were computed with pseudocounts == 1.0 -- confirm.
        use_stored = pseudocounts == 1.0

        # iterate over all available PWM files
        for motif_dir_path in self.motif_data.pwm_list:

            # iterate over all motif elements in this set
            for motif_name, ma in self.motifs_map.items():
                motif_file_name = os.path.join(motif_dir_path,
                                               motif_name + ".pwm")

                # if the motif annotation has a corresponding PWM file, add to return list
                if os.path.isfile(motif_file_name):
                    # take the threshold from ma when it provides a usable one for this fpr
                    if use_stored and fpr in ma.thresholds and ma.thresholds[fpr]:
                        threshold = ma.thresholds[fpr]
                    else:
                        # recalculate it from the PWM file
                        pfm = parsers.pfm(str(motif_file_name))
                        bg = tools.flat_bg(
                            len(pfm)
                        )  # total number of "points" to add, not per-row
                        pssm = tools.log_odds(pfm, bg, pseudocounts, 2)
                        threshold = tools.threshold_from_p(pssm, bg, fpr)

                        # store it only when it is valid for the default pseudocounts
                        if use_stored:
                            ma.thresholds[fpr] = threshold

                    motif_list.append(
                        Motif(motif_file_name, pseudocounts, threshold))

        return motif_list
コード例 #3
0
    def test_match_multiple(self):
        """Matches the CTCF motif on chr1:0-5022 and checks the two expected hits."""

        # restrict the default motif set to the single CTCF motif
        motif_set = MotifSet(preload_motifs="default")
        motif_set = motif_set.filter({'database': ["jaspar_vertebrates"], 'name': ["MA0139.1.CTCF"]},
                                     search="inexact")
        self.assertEqual(len(motif_set), 1)

        ctcf = motif_set.get_motif_list(1, 0.0001)[0]

        scanner = scan.Scanner(7)

        # the forward and the reverse-complement matrices use the same threshold
        thresholds = [ctcf.threshold, ctcf.threshold]
        pssm_list = [ctcf.pssm, ctcf.pssm_rc]

        background = tools.flat_bg(4)
        scanner.set_motifs(pssm_list, background, thresholds)

        region = GenomicRegion("chr1", 0, 5022)

        # sequence associated to the region
        sequence = str(self.genome_file.fetch(region.chrom, region.initial, region.final))

        hits = match_multiple(scanner, [ctcf], sequence, region)

        found = hits.sequences
        expected = [
            GenomicRegion("chr1", 4270, 4289, name="MA0139.1.CTCF", orientation="+"),
            GenomicRegion("chr1", 4180, 4199, name="MA0139.1.CTCF", orientation="-"),
        ]
        self.assertSequenceEqual(found, expected)
コード例 #4
0
    def test_match_multiple(self):
        """Matches the CTCF motif, loaded from its PWM file, on chr1:0-5022 and checks the two expected hits."""

        # the PWM file is located relative to this test file
        here = os.path.dirname(__file__)
        jaspar_dir = "../../data/motifs/jaspar_vertebrates/"

        scanner = scan.Scanner(7)

        ctcf = Motif(os.path.join(here, jaspar_dir, "MA0139.1.CTCF.pwm"), 1, 0.0001, None)

        # one entry for the forward matrix, one for its reverse complement
        thresholds = [ctcf.threshold, ctcf.threshold_rc]
        pssm_list = [ctcf.pssm, ctcf.pssm_rc]

        background = tools.flat_bg(4)
        scanner.set_motifs(pssm_list, background, thresholds)

        region = GenomicRegion("chr1", 0, 5022)

        # sequence associated to the region
        sequence = str(self.genome_file.fetch(region.chrom, region.initial, region.final))

        hits = match_multiple(scanner, [ctcf], sequence, region)

        found = hits.sequences
        expected = [
            GenomicRegion("chr1", 4270, 4289, name="MA0139.1.CTCF", orientation="+"),
            GenomicRegion("chr1", 4180, 4199, name="MA0139.1.CTCF", orientation="-"),
        ]
        self.assertSequenceEqual(found, expected)
コード例 #5
0
ファイル: Motif.py プロジェクト: alexyfyf/reg-gen
    def __init__(self, input_file_name, pseudocounts, fpr, thresholds):
        """ 
        Initializes Motif.

        Keyword arguments:
        input_file_name -- Path of the PWM file. The file name without its last extension becomes
                           the motif name; the name of its parent directory is used as repository.
        pseudocounts -- Pseudocounts passed to tools.log_odds.
        fpr -- False positive rate used to look up / compute the matching thresholds.
        thresholds -- Object whose 'dict' attribute is indexed as dict[repository][name][fpr].
                      May be None, in which case the threshold is computed.

        Fields:
        pfm -- Position Frequency Matrix.
        bg -- Background frequencies.
        pssm -- Position Specific Scoring Matrix.
        pssm_rc -- Reverse complement of pssm.
        alphabet -- A list of letters, eg ["Aa", "Cc", "Gg", "Tt"]
        threshold -- Motif matching threshold.
        threshold_rc -- Matching threshold computed on pssm_rc.
        len -- Length of the motif.
        max -- Maximum PSSM score possible.
        consensus -- Highest-scoring letter of pssm at each position.
        consensus_rc -- Highest-scoring letter of pssm_rc at each position.
        is_palindrome -- True if consensus is biologically palindromic.
        """
        from os.path import basename, dirname

        # Initializing name
        self.name = ".".join(basename(input_file_name).split(".")[:-1])

        # The repository is the directory holding the PWM file. Going through os.path keeps
        # this working for a bare file name (empty repository, so the lookup below simply
        # misses) instead of failing with an IndexError on split("/")[-2].
        repository = basename(dirname(input_file_name))

        # Creating PFM & PSSM
        self.pfm = parsers.pfm(str(input_file_name))
        self.bg = tools.flat_bg(len(
            self.pfm))  # total number of "points" to add, not per-row
        self.pssm = tools.log_odds(self.pfm, self.bg, pseudocounts, 2)
        self.pssm_rc = tools.reverse_complement(self.pssm)

        # how many bases this motif has
        self.len = len(self.pfm[0])

        # maximum value found in the whole PSSM
        self.max = max([max(e) for e in self.pssm])

        # we only support pure DNA or methylated DNA, for now.
        self.alphabet = ["Aa", "Cc", "Gg", "Tt"]
        if len(self.pfm) == 6:
            self.alphabet += ["m", "1"]

        # Evaluating threshold: the stored value is only considered for pseudocounts == 1.0
        self.threshold = None
        if pseudocounts == 1.0 and thresholds is not None:
            try:
                self.threshold = thresholds.dict[repository][self.name][fpr]
            except (AttributeError, KeyError, TypeError):
                # no stored threshold for this repository/motif/fpr: computed below
                pass

        if self.threshold is None:
            # FIXME: this requires a modified version of MOODS. Not sure if we actually need it.
            # self.threshold = tools.threshold_from_p(self.pssm, self.bg, fpr, 2000.0)  # 10000.0 would take too long
            self.threshold = tools.threshold_from_p(self.pssm, self.bg, fpr)
            print(">>> recomputing threshold for %s: %f" %
                  (self.name, self.threshold))

        self.threshold_rc = tools.threshold_from_p(self.pssm_rc, self.bg, fpr)

        self.consensus = "".join(
            [self.alphabet[i][0] for i in argmax(self.pssm, axis=0)])
        self.consensus_rc = "".join(
            [self.alphabet[i][0] for i in argmax(self.pssm_rc, axis=0)])

        # Evaluating if motif is palindromic
        self.is_palindrome = self.consensus == self.consensus_rc
コード例 #6
0
    def __init__(self, input_file_name, pseudocounts, threshold):
        """ 
        Initializes Motif from a PWM file and a precomputed threshold.

        Keyword arguments:
        input_file_name -- Path of the PWM file; the file name without its last extension is the motif name.
        pseudocounts -- Pseudocounts passed to tools.log_odds.
        threshold -- Motif matching threshold, stored as given.

        Fields:
        pfm -- Position Frequency Matrix.
        bg -- Background frequencies.
        pssm -- Position Specific Scoring Matrix.
        pssm_rc -- Reverse complement of pssm.
        alphabet -- A list of letters, eg ["Aa", "Cc", "Gg", "Tt"]
        threshold -- Motif matching threshold.
        len -- Length of the motif.
        max -- Maximum PSSM score possible.
        consensus -- Highest-scoring letter of pssm at each position.
        consensus_rc -- Highest-scoring letter of pssm_rc at each position.
        is_palindrome -- True if consensus is biologically palindromic.
        """

        # motif name: the file name with its last extension removed
        name_parts = basename(input_file_name).split(".")
        self.name = ".".join(name_parts[:-1])

        # frequency matrix, flat background and the two scoring matrices
        self.pfm = parsers.pfm(str(input_file_name))
        n_rows = len(self.pfm)
        self.bg = tools.flat_bg(n_rows)
        self.pssm = tools.log_odds(self.pfm, self.bg, pseudocounts, 2)
        self.pssm_rc = tools.reverse_complement(self.pssm)

        # number of positions (columns) of the motif
        self.len = len(self.pfm[0])

        # highest score found anywhere in the PSSM
        self.max = max(max(row) for row in self.pssm)

        # pure DNA has four rows; six rows mean methylated DNA (only these are supported)
        letters = ["Aa", "Cc", "Gg", "Tt"]
        if len(self.pfm) == 6:
            letters.extend(["m", "1"])
        self.alphabet = letters

        self.threshold = threshold

        # consensus: first symbol of the best-scoring alphabet entry at each position
        forward = (self.alphabet[i][0] for i in argmax(self.pssm, axis=0))
        self.consensus = "".join(forward)
        reverse = (self.alphabet[i][0] for i in argmax(self.pssm_rc, axis=0))
        self.consensus_rc = "".join(reverse)

        # the motif is palindromic when both strands give the same consensus
        self.is_palindrome = self.consensus == self.consensus_rc
コード例 #7
0
ファイル: Motif.py プロジェクト: CostaLab/reg-gen
    def __init__(self, input_file_name, pseudocounts, threshold):
        """ 
        Initializes Motif using the given PWM file and matching threshold.

        Keyword arguments:
        input_file_name -- Path of the PWM file; its file name minus the last extension names the motif.
        pseudocounts -- Pseudocounts passed to tools.log_odds.
        threshold -- Motif matching threshold, stored unchanged.

        Fields:
        pfm -- Position Frequency Matrix.
        bg -- Background frequencies.
        pssm -- Position Specific Scoring Matrix.
        pssm_rc -- Reverse complement of pssm.
        alphabet -- A list of letters, eg ["Aa", "Cc", "Gg", "Tt"]
        threshold -- Motif matching threshold.
        len -- Length of the motif.
        max -- Maximum PSSM score possible.
        consensus -- Highest-scoring letter of pssm at each position.
        consensus_rc -- Highest-scoring letter of pssm_rc at each position.
        is_palindrome -- True if consensus is biologically palindromic.
        """

        # Name: everything before the last "." of the file name
        file_name = basename(input_file_name)
        self.name = ".".join(file_name.split(".")[:-1])

        # PFM, flat background, PSSM and its reverse complement
        pfm = parsers.pfm(str(input_file_name))
        self.pfm = pfm
        self.bg = tools.flat_bg(len(pfm))
        self.pssm = tools.log_odds(pfm, self.bg, pseudocounts, 2)
        self.pssm_rc = tools.reverse_complement(self.pssm)

        # motif length = number of columns of the PFM
        self.len = len(pfm[0])

        # largest single score of the PSSM
        row_maxima = [max(row) for row in self.pssm]
        self.max = max(row_maxima)

        # only pure DNA (4 rows) or methylated DNA (6 rows) are supported, for now
        if len(pfm) == 6:
            self.alphabet = ["Aa", "Cc", "Gg", "Tt", "m", "1"]
        else:
            self.alphabet = ["Aa", "Cc", "Gg", "Tt"]

        self.threshold = threshold

        # best-scoring letter of every position, on both strands
        best_fw = argmax(self.pssm, axis=0)
        self.consensus = "".join([self.alphabet[i][0] for i in best_fw])
        best_rc = argmax(self.pssm_rc, axis=0)
        self.consensus_rc = "".join([self.alphabet[i][0] for i in best_rc])

        # palindromic when forward and reverse-complement consensus coincide
        self.is_palindrome = self.consensus == self.consensus_rc
コード例 #8
0
ファイル: createMtf.py プロジェクト: CostaLab/reg-gen
        # Annotation of this PWM, read positionally from hocomoco_anno:
        # [0] gene names, [1] group, [2] uniprot, [3] data source
        gene_names = hocomoco_anno[pwm_name][0]
        group = hocomoco_anno[pwm_name][1]
        if not group:
            # an empty group is written as "."
            group = "."
        uniprot = hocomoco_anno[pwm_name][2]
        data_source = hocomoco_anno[pwm_name][3]
        taxGroup = "vertebrates"

        # the species tag is the text between the first "_" and the next "." of the PWM name;
        # tags other than HUMAN/MOUSE are kept as they are
        species = (pwm_name.split("_")[1]).split(".")[0]
        if species == "HUMAN":
            species = "Homo sapiens"
        elif species == "MOUSE":
            species = "Mus musculus"

        # Creating PSSM
        pfm = parsers.pfm(inputFileName)
        bg = tools.flat_bg(len(pfm))  # total number of "points" to add, not per-row
        pssm = tools.log_odds(pfm, bg, pseudocounts, 2)
        threshold_list = []

        # Evaluating thresholds: one per requested false positive rate, in fprList order
        for fpr in fprList:
            # Note: this requires a modified version of MOODS. Only use it if you know what you are doing
            # resVec.append(str(tools.threshold_from_p(pssm, bg, fpr, 10000.0)))
            threshold = tools.threshold_from_p(pssm, bg, fpr)
            threshold_list.append(str(threshold))

        # all thresholds are stored in a single comma-separated field
        threshold = ",".join(threshold_list)

        resultMatrix.append([matrix_id, pwm_name, version, gene_names, group, uniprot, data_source, taxGroup,
                             species, threshold])

    # Sorting results by ID
コード例 #9
0
ファイル: Match.py プロジェクト: michael-kotliar/reg-gen
def _remove_strand_duplicates(seqs):
    """
    Collapses runs of consecutive regions describing the same hit on opposing strands.

    A region extends the current run when it has the same name, chrom, initial and final as the
    region currently selected for the run, but a different orientation. From every run only the
    region with the highest score (float(data)) is kept; on ties the earlier region is kept.

    Keyword arguments:
    seqs -- List of regions, expected to be sorted already. It is not modified.

    Return:
    kept -- New list with one region per run, in the original order.
    """
    kept = []
    total = len(seqs)
    cur_pos = 0

    while cur_pos < total:
        gr = seqs[cur_pos]

        # The scan has to include the very last region: stopping one element early
        # leaves a duplicate pair at the end of the list untouched.
        new_pos = cur_pos + 1
        while new_pos < total:
            gr2 = seqs[new_pos]

            # if this sequence is unrelated, we move on
            if gr.name != gr2.name or gr.chrom != gr2.chrom or gr.initial != gr2.initial \
                    or gr.final != gr2.final or gr.orientation == gr2.orientation:
                break

            if float(gr.data) < float(gr2.data):
                gr = gr2

            new_pos = new_pos + 1

        # adding the currently-selected genomic region
        kept.append(gr)

        # at the next loop, we start from the first region not belonging to this run
        cur_pos = new_pos

    return kept


def main(args):
    """
    Performs motif matching.

    The regions to match are collected from an experimental matrix (args.input_matrix) or from
    input files (args.input_files), plus, when requested, target/background promoter regions and
    random regions. Every motif of the selected motif set is then matched against the genome
    sequence of each region, and the hits of each region set are written to
    "<output location>/<region set name>_mpbs.bed".

    Keyword arguments:
    args -- Parsed command line options. The attributes read here are: filter, filter_type,
            output_location, organism, pseudocounts, fpr, input_matrix, input_files,
            target_genes_filename, promoter_length, promoter_make_background, promoters_only,
            rmdup, rand_proportion, bigbed, motif_dbs, norm_threshold, remove_strand_duplicates
            and normalize_bitscore.
    """

    ###################################################################################################
    # Processing Input Arguments
    ###################################################################################################

    # Initializing Error Handler
    err = ErrorHandler()

    # Additional Parameters
    matching_folder_name = "match"
    random_region_name = "random_regions"

    filter_values = parse_filter(args.filter)

    ###################################################################################################
    # Initializations
    ###################################################################################################

    # Output folder
    if args.output_location:
        output_location = args.output_location
    else:
        output_location = npath(matching_folder_name)
    print(">> output location:", output_location)

    # Default genomic data
    genome_data = GenomeData(args.organism)

    print(">> genome:", genome_data.organism)
    print(">> pseudocounts:", args.pseudocounts)
    print(">> fpr threshold:", args.fpr)

    ###################################################################################################
    # Reading Input Regions
    ###################################################################################################

    genomic_regions_dict = {}

    # get experimental matrix, if available
    if args.input_matrix:
        try:
            exp_matrix = ExperimentalMatrix()
            exp_matrix.read(args.input_matrix)

            # if the matrix is present, the (empty) dictionary is overwritten
            genomic_regions_dict = exp_matrix.objectsDict

            print(">>> experimental matrix loaded")

        except Exception:
            err.throw_error("MM_WRONG_EXPMAT")
    elif args.input_files:
        # get input files, if available
        for input_filename in args.input_files:
            name, _ = os.path.splitext(os.path.basename(input_filename))

            regions = GenomicRegionSet(name)
            regions.read(npath(input_filename))

            genomic_regions_dict[name] = regions

            print(">>> input file", name, "loaded:", len(regions), "regions")

    # we put this here because we don't want to create the output directory unless we
    # are sure the initialisation (including loading input files) worked
    try:
        if not os.path.isdir(output_location):
            os.makedirs(output_location)
    except Exception:
        err.throw_error("MM_OUT_FOLDER_CREATION")

    annotation = None
    target_genes = None
    # get promoter regions from list of genes (both target and background)
    # TODO: should be more clever, allow precomputed regions etc
    if args.target_genes_filename:
        annotation = AnnotationSet(args.organism,
                                   alias_source=args.organism,
                                   protein_coding=True,
                                   known_only=True)

        target_genes = GeneSet("target_genes")
        target_genes.read(args.target_genes_filename)

        # TODO: what do we do with unmapped genes? maybe just print them out
        target_regions = annotation.get_promoters(
            gene_set=target_genes, promoter_length=args.promoter_length)
        target_regions.name = "target_regions"
        target_regions.sort()
        output_file_name = npath(
            os.path.join(output_location, target_regions.name + ".bed"))
        target_regions.write(output_file_name)

        genomic_regions_dict[target_regions.name] = target_regions

        print(">>> target promoter file created:", len(target_regions),
              "regions")

    # we make a background in case it's requested, but also in case a list of target genes has not been
    # provided
    if args.promoter_make_background or (args.promoters_only
                                         and not args.target_genes_filename):
        if not annotation:
            annotation = AnnotationSet(args.organism,
                                       alias_source=args.organism,
                                       protein_coding=True,
                                       known_only=True)

        # background is made of all known genes minus the target genes (if any)
        background_genes = GeneSet("background_genes")
        background_genes.get_all_genes(organism=args.organism)

        if target_genes:
            background_genes.subtract(target_genes)

        background_regions = annotation.get_promoters(
            gene_set=background_genes, promoter_length=args.promoter_length)
        background_regions.name = "background_regions"
        background_regions.sort()
        output_file_name = npath(
            os.path.join(output_location, background_regions.name + ".bed"))
        background_regions.write(output_file_name)

        genomic_regions_dict[background_regions.name] = background_regions

        print(">>> background promoter file created:", len(background_regions),
              "regions")

    if not genomic_regions_dict:
        err.throw_error(
            "DEFAULT_ERROR",
            add_msg=
            "You must either specify an experimental matrix, or at least a "
            "valid input file, or one of the 'promoter test' options.")

    max_region_len = 0
    max_region = None
    regions_to_match = []

    # Iterating on experimental matrix objects
    for curr_genomic_region in genomic_regions_dict.values():

        # Only GenomicRegionSet objects are matched; anything else is ignored
        if isinstance(curr_genomic_region, GenomicRegionSet):

            if args.rmdup:
                # remove duplicates and sort regions
                curr_genomic_region.remove_duplicates(sort=True)
            else:
                # sort regions
                curr_genomic_region.sort()

            # Append label and GenomicRegionSet
            regions_to_match.append(curr_genomic_region)

            # Verifying max_region_len for random region generation:
            # the largest region set is the template for the random regions
            curr_len = len(curr_genomic_region)
            if curr_len > max_region_len:
                max_region_len = curr_len
                max_region = curr_genomic_region

    print(">> all files loaded")

    ###################################################################################################
    # Creating random regions
    ###################################################################################################

    # if a random proportion is set, create random regions
    if args.rand_proportion:

        # Create random coordinates and name it random_regions
        rand_region = max_region.random_regions(
            args.organism, multiply_factor=args.rand_proportion, chrom_X=True)
        rand_region.sort()
        rand_region.name = random_region_name

        # Add random regions to the list of regions to perform matching on
        regions_to_match.append(rand_region)

        # Writing random regions
        output_file_name = npath(
            os.path.join(output_location, random_region_name))
        rand_bed_file_name = output_file_name + ".bed"
        rand_region.write(rand_bed_file_name)

        # Verifying condition to write bb
        if args.bigbed:

            # Fetching file with chromosome sizes
            chrom_sizes_file = genome_data.get_chromosome_sizes()

            try:
                # Converting to big bed
                bed_to_bb(rand_bed_file_name, chrom_sizes_file)

                # removing previously-created BED file
                os.remove(rand_bed_file_name)
            except Exception:
                err.throw_warning(
                    "DEFAULT_WARNING")  # FIXME: maybe error instead?

        print(">> random regions file created:", len(rand_region), "regions")

    ###################################################################################################
    # Creating PWMs
    ###################################################################################################

    if args.motif_dbs:
        ms = MotifSet(preload_motifs=args.motif_dbs, motif_dbs=True)
        # filter for dbs only if --motif_dbs is not set
        if 'database' in filter_values:
            del filter_values['database']
    else:
        if 'database' in filter_values:
            ms = MotifSet(preload_motifs=filter_values['database'])
        else:
            ms = MotifSet(preload_motifs="default")

    print(">> used database(s):",
          ",".join([str(db) for db in ms.motif_data.repositories_list]))

    # applying filtering pattern, taking a subset of the motif set
    if args.filter:
        ms = ms.filter(filter_values, search=args.filter_type)

    motif_list = ms.get_motif_list(args.pseudocounts, args.fpr)

    print(">> motifs loaded:", len(motif_list))

    # Performing normalized threshold strategy if requested.
    # With no motifs there is nothing to average (and the mean would divide by zero).
    if args.norm_threshold and motif_list:
        threshold_list = [motif.threshold / motif.len for motif in motif_list]
        unique_threshold = sum(threshold_list) / len(threshold_list)
    else:
        unique_threshold = None

    # Every motif contributes two consecutive entries to the scanner:
    # its PSSM and the reverse complement of its PSSM, each with its own threshold.
    scanner = scan.Scanner(7)
    pssm_list = []
    thresholds = []
    for motif in motif_list:
        if unique_threshold:
            thresholds.append(0.0)
            thresholds.append(0.0)
        else:
            thresholds.append(motif.threshold)
            thresholds.append(motif.threshold)

        pssm_list.append(motif.pssm)
        pssm_list.append(motif.pssm_rc)

    # Performing motif matching
    # TODO: we can expand this to use bg from sequence, for example,
    # or from organism.
    bg = tools.flat_bg(4)
    scanner.set_motifs(pssm_list, bg, thresholds)

    ###################################################################################################
    # Motif Matching
    ###################################################################################################

    # Creating genome file
    genome_file = Fastafile(genome_data.get_genome())

    print()

    # Iterating on list of genomic region sets
    for grs in regions_to_match:

        start = time.time()
        print(">> matching [",
              grs.name,
              "], ",
              len(grs),
              " regions... ",
              sep="",
              end='')
        sys.stdout.flush()

        # Initializing output bed file
        output_bed_file = os.path.join(output_location, grs.name + "_mpbs.bed")

        # must remove it because we append the MPBS
        if os.path.isfile(output_bed_file):
            os.remove(output_bed_file)

        # Iterating on genomic region set
        for genomic_region in grs:

            # Reading sequence associated to genomic_region
            sequence = str(
                genome_file.fetch(genomic_region.chrom, genomic_region.initial,
                                  genomic_region.final))

            grs_tmp = match_multiple(scanner, motif_list, sequence,
                                     genomic_region)

            # post-processing: if required, remove duplicate regions on opposing strands (keep highest score)
            if len(grs_tmp) > 1 and args.remove_strand_duplicates:
                grs_tmp.sort()
                grs_tmp.sequences = _remove_strand_duplicates(grs_tmp.sequences)

            grs_tmp.write(output_bed_file, mode="a")

        # the regions of this set are not needed any more
        del grs.sequences[:]

        # Verifying condition to write bb
        if args.bigbed and args.normalize_bitscore:
            # Fetching file with chromosome sizes
            chrom_sizes_file = genome_data.get_chromosome_sizes()

            # Converting to big bed
            bed_to_bb(output_bed_file, chrom_sizes_file)

            # removing BED file
            os.remove(output_bed_file)

        secs = time.time() - start
        print("[", "%02.3f" % secs, " seconds]", sep="")
コード例 #10
0
ファイル: createMtf.py プロジェクト: michael-kotliar/reg-gen
        # Fields of the annotation record of this PWM, taken by position from hocomoco_anno:
        # [0] gene names, [1] group, [2] uniprot, [3] data source
        gene_names = hocomoco_anno[pwm_name][0]
        group = hocomoco_anno[pwm_name][1]
        if not group:
            # "." stands for an empty group
            group = "."
        uniprot = hocomoco_anno[pwm_name][2]
        data_source = hocomoco_anno[pwm_name][3]
        taxGroup = "vertebrates"

        # species tag: the part of the PWM name between its first "_" and the following ".";
        # only HUMAN and MOUSE are translated, any other tag is kept unchanged
        species = (pwm_name.split("_")[1]).split(".")[0]
        if species == "HUMAN":
            species = "Homo sapiens"
        elif species == "MOUSE":
            species = "Mus musculus"

        # Creating PSSM
        pfm = parsers.pfm(inputFileName)
        bg = tools.flat_bg(
            len(pfm))  # total number of "points" to add, not per-row
        pssm = tools.log_odds(pfm, bg, pseudocounts, 2)
        threshold_list = []

        # Evaluating thresholds: one value for each false positive rate of fprList, in order
        for fpr in fprList:
            # Note: this requires a modified version of MOODS. Only use it if you know what you are doing
            # resVec.append(str(tools.threshold_from_p(pssm, bg, fpr, 10000.0)))
            threshold = tools.threshold_from_p(pssm, bg, fpr)
            threshold_list.append(str(threshold))

        # the thresholds are written as one comma-separated field
        threshold = ",".join(threshold_list)

        resultMatrix.append([
            matrix_id, pwm_name, version, gene_names, group, uniprot,
            data_source, taxGroup, species, threshold
        ])
コード例 #11
0
ファイル: Match.py プロジェクト: CostaLab/reg-gen
def _remove_strand_duplicates(seqs):
    """
    Collapses adjacent hits that only differ by strand, keeping the best-scoring one.

    Two consecutive entries are treated as duplicates when they share name, chrom, initial and
    final but have a different orientation. The list must already be sorted so that such entries
    are adjacent. The score is read from the "data" attribute and compared as a float.

    Returns a new list; the input list is not modified.
    """
    kept = []
    total = len(seqs)
    cur_pos = 0

    while cur_pos < total:
        best = seqs[cur_pos]

        # scan forward (up to and including the last element) while entries are strand duplicates
        new_pos = cur_pos + 1
        while new_pos < total:
            candidate = seqs[new_pos]

            # if this sequence is unrelated, we move on
            if (best.name != candidate.name or best.chrom != candidate.chrom
                    or best.initial != candidate.initial or best.final != candidate.final
                    or best.orientation == candidate.orientation):
                break

            if float(best.data) < float(candidate.data):
                best = candidate

            new_pos += 1

        # adding the currently-selected genomic region
        kept.append(best)

        # at the next loop, we start from the first entry that was not merged
        cur_pos = new_pos

    return kept


def main(args):
    """
    Performs motif matching.

    Loads the input regions (experimental matrix, input files and/or promoter regions of target
    and background genes), optionally creates random regions, builds the motif scanner from the
    selected motif set and writes, for every region set, a "<name>_mpbs.bed" file with the
    matches into the output folder (optionally converted to bigBed).

    The "args" object is expected to provide the attributes read below (filter, output_location,
    organism, pseudocounts, fpr, input_matrix, input_files, target_genes_filename,
    promoter_length, promoter_make_background, promoters_only, rmdup, rand_proportion, bigbed,
    motif_dbs, filter_type, norm_threshold, remove_strand_duplicates, normalize_bitscore).
    """

    ###################################################################################################
    # Processing Input Arguments
    ###################################################################################################

    # Initializing Error Handler
    err = ErrorHandler()

    # Additional Parameters
    matching_folder_name = "match"
    random_region_name = "random_regions"

    filter_values = parse_filter(args.filter)

    ###################################################################################################
    # Initializations
    ###################################################################################################

    # Output folder
    if args.output_location:
        output_location = args.output_location
    else:
        output_location = npath(matching_folder_name)
    print(">> output location:", output_location)

    # Default genomic data
    genome_data = GenomeData(args.organism)

    print(">> genome:", genome_data.organism)
    print(">> pseudocounts:", args.pseudocounts)
    print(">> fpr threshold:", args.fpr)

    ###################################################################################################
    # Reading Input Regions
    ###################################################################################################

    genomic_regions_dict = {}

    # get experimental matrix, if available
    if args.input_matrix:
        try:
            exp_matrix = ExperimentalMatrix()
            exp_matrix.read(args.input_matrix)

            # if the matrix is present, the (empty) dictionary is overwritten
            genomic_regions_dict = exp_matrix.objectsDict

            print(">>> experimental matrix loaded")

        except Exception:
            err.throw_error("MM_WRONG_EXPMAT")
    elif args.input_files:
        # get input files, if available
        for input_filename in args.input_files:
            name, _ = os.path.splitext(os.path.basename(input_filename))

            regions = GenomicRegionSet(name)
            regions.read(npath(input_filename))

            genomic_regions_dict[name] = regions

            print(">>> input file", name, "loaded:", len(regions), "regions")

    # we put this here because we don't want to create the output directory unless we
    # are sure the initialisation (including loading input files) worked
    try:
        if not os.path.isdir(output_location):
            os.makedirs(output_location)
    except Exception:
        err.throw_error("MM_OUT_FOLDER_CREATION")

    annotation = None
    target_genes = None
    # get promoter regions from list of genes (both target and background)
    # TODO: should be more clever, allow precomputed regions etc
    if args.target_genes_filename:
        annotation = AnnotationSet(args.organism, alias_source=args.organism,
                                   protein_coding=True, known_only=True)

        target_genes = GeneSet("target_genes")
        target_genes.read(args.target_genes_filename)

        # TODO: what do we do with unmapped genes? maybe just print them out
        target_regions = annotation.get_promoters(gene_set=target_genes, promoter_length=args.promoter_length)
        target_regions.name = "target_regions"
        target_regions.sort()
        output_file_name = npath(os.path.join(output_location, target_regions.name + ".bed"))
        target_regions.write(output_file_name)

        genomic_regions_dict[target_regions.name] = target_regions

        print(">>> target promoter file created:", len(target_regions), "regions")

    # we make a background in case it's requested, but also in case a list of target genes has not been
    # provided
    if args.promoter_make_background or (args.promoters_only and not args.target_genes_filename):
        if not annotation:
            annotation = AnnotationSet(args.organism, alias_source=args.organism,
                                       protein_coding=True, known_only=True)

        # background is made of all known genes minus the target genes (if any)
        background_genes = GeneSet("background_genes")
        background_genes.get_all_genes(organism=args.organism)

        if target_genes:
            background_genes.subtract(target_genes)

        background_regions = annotation.get_promoters(gene_set=background_genes,
                                                      promoter_length=args.promoter_length)
        background_regions.name = "background_regions"
        background_regions.sort()
        output_file_name = npath(os.path.join(output_location, background_regions.name + ".bed"))
        background_regions.write(output_file_name)

        genomic_regions_dict[background_regions.name] = background_regions

        print(">>> background promoter file created:", len(background_regions), "regions")

    if not genomic_regions_dict:
        err.throw_error("DEFAULT_ERROR", add_msg="You must either specify an experimental matrix, or at least a "
                                                 "valid input file, or one of the 'promoter test' options.")

    max_region_len = 0
    max_region = None
    regions_to_match = []

    # Iterating on experimental matrix objects
    for k in genomic_regions_dict.keys():

        curr_genomic_region = genomic_regions_dict[k]

        # If the object is a GenomicRegionSet
        if isinstance(curr_genomic_region, GenomicRegionSet):

            if args.rmdup:
                # remove duplicates and sort regions
                curr_genomic_region.remove_duplicates(sort=True)
            else:
                # sort regions
                curr_genomic_region.sort()

            # Append label and GenomicRegionSet
            regions_to_match.append(curr_genomic_region)

            # Verifying max_region_len for random region generation
            curr_len = len(curr_genomic_region)
            if curr_len > max_region_len:
                max_region_len = curr_len
                max_region = curr_genomic_region

    print(">> all files loaded")

    ###################################################################################################
    # Creating random regions
    ###################################################################################################

    # if a random proportion is set, create random regions
    if args.rand_proportion:

        # max_region is still None when no non-empty GenomicRegionSet was loaded; report that
        # through the error handler instead of failing with an AttributeError below
        # NOTE(review): assumes throw_error terminates the program - confirm
        if max_region is None:
            err.throw_error("DEFAULT_ERROR", add_msg="Random regions cannot be created because no non-empty "
                                                     "input region set is available.")

        # Create random coordinates and name it random_regions
        rand_region = max_region.random_regions(args.organism, multiply_factor=args.rand_proportion, chrom_X=True)
        rand_region.sort()
        rand_region.name = random_region_name

        # Add random regions to the list of regions to perform matching on
        regions_to_match.append(rand_region)

        # Writing random regions
        output_file_name = npath(os.path.join(output_location, random_region_name))
        rand_bed_file_name = output_file_name + ".bed"
        rand_region.write(rand_bed_file_name)

        # Verifying condition to write bb
        if args.bigbed:

            # Fetching file with chromosome sizes
            chrom_sizes_file = genome_data.get_chromosome_sizes()

            try:
                # Converting to big bed
                bed_to_bb(rand_bed_file_name, chrom_sizes_file)

                # removing previously-created BED file
                os.remove(rand_bed_file_name)
            except Exception:
                err.throw_warning("DEFAULT_WARNING")  # FIXME: maybe error instead?

        print(">> random regions file created:", len(rand_region), "regions")

    ###################################################################################################
    # Creating PWMs
    ###################################################################################################

    if args.motif_dbs:
        ms = MotifSet(preload_motifs=args.motif_dbs, motif_dbs=True)
        # filter for dbs only if --motif_dbs is not set
        if 'database' in filter_values:
            del filter_values['database']
    else:
        if 'database' in filter_values:
            ms = MotifSet(preload_motifs=filter_values['database'])
        else:
            ms = MotifSet(preload_motifs="default")

    print(">> used database(s):", ",".join([str(db) for db in ms.motif_data.repositories_list]))

    # applying filtering pattern, taking a subset of the motif set
    if args.filter:
        ms = ms.filter(filter_values, search=args.filter_type)

    motif_list = ms.get_motif_list(args.pseudocounts, args.fpr)

    print(">> motifs loaded:", len(motif_list))

    # Performing normalized threshold strategy if requested
    # (skipped when no motif is left, to avoid dividing by zero)
    if args.norm_threshold and motif_list:
        threshold_list = [motif.threshold / motif.len for motif in motif_list]
        unique_threshold = sum(threshold_list) / len(threshold_list)
    else:
        unique_threshold = None

    # every motif is registered twice: its PSSM and the reverse-complement PSSM, each with its own
    # threshold entry, so pssm_list and thresholds stay aligned
    # NOTE(review): unique_threshold only switches the scanner thresholds to 0.0 here and is not
    # passed on to match_multiple in this code - confirm that this is intended
    scanner = scan.Scanner(7)
    pssm_list = []
    thresholds = []
    for motif in motif_list:
        # "is not None" so that a computed mean of exactly 0.0 still counts as a set threshold
        if unique_threshold is not None:
            thresholds.append(0.0)
            thresholds.append(0.0)
        else:
            thresholds.append(motif.threshold)
            thresholds.append(motif.threshold)

        pssm_list.append(motif.pssm)
        pssm_list.append(motif.pssm_rc)

    # Performing motif matching
    # TODO: we can expand this to use bg from sequence, for example,
    # or from organism.
    bg = tools.flat_bg(4)
    scanner.set_motifs(pssm_list, bg, thresholds)

    ###################################################################################################
    # Motif Matching
    ###################################################################################################

    # Creating genome file
    genome_file = Fastafile(genome_data.get_genome())

    print()

    # Iterating on list of genomic region sets
    for grs in regions_to_match:

        start = time.time()
        print(">> matching [", grs.name, "], ", len(grs), " regions... ", sep="", end='')
        sys.stdout.flush()

        # Initializing output bed file
        output_bed_file = os.path.join(output_location, grs.name + "_mpbs.bed")

        # must remove it because we append the MPBS
        if os.path.isfile(output_bed_file):
            os.remove(output_bed_file)

        # Iterating on genomic region set
        for genomic_region in grs:

            # Reading sequence associated to genomic_region
            sequence = str(genome_file.fetch(genomic_region.chrom, genomic_region.initial, genomic_region.final))

            grs_tmp = match_multiple(scanner, motif_list, sequence, genomic_region)

            # post-processing: if required, remove duplicate regions on opposing strands (keep highest score)
            if len(grs_tmp) > 1 and args.remove_strand_duplicates:
                # sorting makes the strand duplicates adjacent, which the helper relies on
                grs_tmp.sort()
                grs_tmp.sequences = _remove_strand_duplicates(grs_tmp.sequences)

            grs_tmp.write(output_bed_file, mode="a")

        # the regions of this set are no longer needed once its matches are written
        del grs.sequences[:]

        # Verifying condition to write bb
        if args.bigbed and args.normalize_bitscore:
            # Fetching file with chromosome sizes
            chrom_sizes_file = genome_data.get_chromosome_sizes()

            # Converting to big bed
            bed_to_bb(output_bed_file, chrom_sizes_file)

            # removing BED file
            os.remove(output_bed_file)

        secs = time.time() - start
        print("[", "%02.3f" % secs, " seconds]", sep="")