def parse_file(self, file, progress_callback=None):
        """
        Parse an ontology file and populate this object's terms, typedefs,
        instances, header and alias mappings.

        :param file: Either an open file-like object providing
            ``readlines()``, or a string naming one of: a tar archive
            containing ``gene_ontology_edit.obo``, a plain file, or a
            directory containing ``gene_ontology_edit.obo``.
        :param progress_callback: Optional callable invoked with a single
            number reporting progress; stanza parsing covers the first 90
            and relation/alias linking the remaining 10.
        :raises ValueError: If `file` is a string that is neither an
            existing file nor a directory, or if the contents contain no
            ``[Term]`` marker (needed to delimit the header).
        """
        # Resources opened here (as opposed to a caller-supplied file
        # object) are tracked so they can be closed once the contents have
        # been read. A file object passed in by the caller is left open.
        owned = []
        try:
            if isinstance(file, str):
                if os.path.isfile(file) and tarfile.is_tarfile(file):
                    archive = tarfile.open(file)
                    owned.append(archive)
                    f = archive.extractfile("gene_ontology_edit.obo")
                    if f is not None:
                        owned.append(f)
                elif os.path.isfile(file):
                    f = open(file)
                    owned.append(f)
                elif os.path.isdir(file):
                    f = open(os.path.join(file, "gene_ontology_edit.obo"))
                    owned.append(f)
                else:
                    raise ValueError("Cannot open %r for parsing" % file)
            else:
                f = file

            # Archive members yield bytes while text files yield str;
            # normalise everything to str before parsing.
            lines = [
                line.decode() if not isinstance(line, str) else line
                for line in f.readlines()
            ]
        finally:
            # Close the extracted member before the archive that owns it.
            for resource in reversed(owned):
                resource.close()

        # Lines starting with "!" are dropped before any parsing happens.
        data = "".join([line for line in lines if not line.startswith("!")])
        # Everything before the first "[Term]" marker is kept as the header.
        self.header = data[:data.index("[Term]")]
        # A stanza is a "[Name]" tag followed by everything up to the next
        # blank line.
        # NOTE: a final stanza that is not followed by a blank line is not
        # matched by this pattern.
        stanza_re = re.compile(r"\[.+?\].*?\n\n", re.DOTALL)
        stanzas = builtin_obo_objects + stanza_re.findall(data)

        # Milestones and percentages are based on the number of stanzas
        # actually iterated (built-in objects included).
        total = len(stanzas)
        milestones = progress_bar_milestones(total, 90)
        for i, block in enumerate(stanzas):
            if block.startswith("[Term]"):
                term = Term(block, self)
                self.terms[term.id] = term
            elif block.startswith("[Typedef]"):
                typedef = Typedef(block, self)
                self.typedefs[typedef.id] = typedef
            elif block.startswith("[Instance]"):
                instance = Instance(block, self)
                self.instances[instance.id] = instance
            if progress_callback and i in milestones:
                progress_callback(90.0 * i / total)

        self.alias_mapper = {}
        self.reverse_alias_mapper = defaultdict(set)
        n_terms = len(self.terms)
        milestones = progress_bar_milestones(n_terms, 10)
        for i, (term_id, term) in enumerate(six.iteritems(self.terms)):
            # Record the reverse edge on every term this one relates to.
            for type_id, parent in term.related:
                self.terms[parent].related_to.add((type_id, term_id))
            # Terms without an ``alt_id`` attribute contribute no aliases.
            try:
                alt_ids = term.alt_id
            except AttributeError:
                alt_ids = None
            if alt_ids is not None:
                self.alias_mapper.update(
                    [(alt_id, term_id) for alt_id in alt_ids])
                self.reverse_alias_mapper[term_id].update(alt_ids)
            if progress_callback and i in milestones:
                progress_callback(90.0 + 10.0 * i / n_terms)
    def get_enriched_pathways(self,
                              genes,
                              reference=None,
                              prob=statistics.Binomial(),
                              callback=None):
        """
        Compute pathway enrichment for `genes`.

        Returns a dictionary keyed by pathway id whose values are
        (list_of_genes, p_value, num_of_reference_genes) tuples.

        When `reference` is None the keys of ``self.genes`` are used as
        the reference set. `callback`, if given, is called with a progress
        value: the pathway lookup per gene covers the first 50 and the
        gene-to-pathway assignment the remaining 50.
        """
        reference = set(
            self.genes.keys() if reference is None else reference)

        gene_count = len(genes)
        milestones = progress_bar_milestones(gene_count, 100)
        pathways_db = KEGGPathways()

        # Pass 1: look up the pathways of each gene individually.
        per_gene = []
        for index, gene in enumerate(genes):
            per_gene.append(self.pathways([gene]))
            if callback and index in milestones:
                callback(index * 50.0 / gene_count)

        # pre-cache for speed
        wanted = []
        for found in per_gene:
            wanted.extend(found)
        pathways_db.pre_cache(wanted)

        # Pass 2: group the query genes by pathway, skipping pathways
        # whose entry has no gene list.
        hits = defaultdict(lambda: [[], 1.0, []])
        for index, (gene, found) in enumerate(zip(genes, per_gene)):
            for pathway_id in found:
                if not pathways_db.get_entry(pathway_id).gene:
                    continue
                hits[pathway_id][0].append(gene)
            if callback and index in milestones:
                callback(50.0 + index * 50.0 / gene_count)

        # Pass 3: score every pathway that received at least one gene.
        reference_size = len(reference)
        enriched = {}
        for pathway_id, record in hits.items():
            mapped = record[0]
            entry = pathways_db.get_entry(pathway_id)
            in_reference = reference.intersection(entry.gene or [])
            p = prob.p_value(len(mapped), reference_size,
                             len(in_reference), gene_count)
            enriched[pathway_id] = (mapped, p, len(in_reference))
        return enriched
    def get_enriched_terms(
            self,
            genes,
            reference=None,
            evidence_codes=None,
            slims_only=False,
            aspect=None,
            prob=statistics.Binomial(),
            use_fdr=True,
            progress_callback=None,
    ):
        """
        Return a dictionary of enriched terms, with tuples of
        (list_of_genes, p_value, reference_count) for items and term
        ids as keys. P-Values are FDR adjusted if use_fdr is True (default).

        :param genes: List of genes. Duplicates are counted once.
        :param reference: List (or any iterable) of genes; if None the
                          result of ``self.genes()`` is used.
        :param evidence_codes: List of evidence codes to consider
                               (all keys of ``evidence_dict`` if empty/None).
        :param slims_only: If `True` return only slim terms.
        :param aspect: Which aspects to use. Use all by default;
                       one of Process (biological process),
                       Function (molecular function) or Component (cellular component),
                       or a collection of those names.
        :param prob: Object whose ``p_value(mapped, reference_size,
                     mapped_reference, sample_size)`` method scores a term.
        :param use_fdr: If `True`, replace the raw p-values with the values
                        returned by ``statistics.FDR`` on the sorted p-values.
        :param progress_callback: Optional callable invoked with a single
                                  progress value while terms are scored.
        """

        all_genes = set(genes)

        if aspect is None:
            aspects_set = {'Process', 'Component', 'Function'}
        elif isinstance(aspect, str):
            aspects_set = {aspect}
        else:
            aspects_set = aspect

        # ``reference`` is documented as a list but is used with set
        # operations below, so normalise it to a set once. This also makes
        # ``len(reference)`` count each reference gene a single time.
        reference = set(self.genes() if reference is None else reference)

        evidence_codes = set(evidence_codes or evidence_dict.keys())

        def is_selected(ann):
            # An annotation takes part only if both its evidence code and
            # its aspect were requested.
            return (ann.evidence in evidence_codes
                    and ann.aspect in aspects_set)

        annotations = [
            ann for gene in genes for ann in self.gene_annotations[gene]
            if is_selected(ann)
        ]

        ref_annotations = {
            ann
            for gene in reference for ann in self.gene_annotations[gene]
            if is_selected(ann)
        }

        # Group the query annotations by GO id; only the ids are used below.
        annotations_dict = defaultdict(set)
        for ann in annotations:
            annotations_dict[ann.go_id].add(ann)

        self._ensure_ontology()

        if slims_only and not self.ontology.slims_subset:
            warnings.warn(
                "Unspecified slims subset in the ontology! "
                "Using 'goslim_generic' subset", UserWarning)
            self.ontology.set_slims_subset('goslim_generic')

        terms = annotations_dict.keys()
        filtered_terms = [term for term in terms if term in self.ontology]

        if len(terms) != len(filtered_terms):
            term_diff = set(terms) - set(filtered_terms)
            warnings.warn(
                "%s terms in the annotations were not found in the "
                "ontology." % ",".join(map(repr, term_diff)),
                UserWarning,
            )

        terms = self.ontology.extract_super_graph(filtered_terms)
        res = {}

        milestones = progress_bar_milestones(len(terms), 100)

        reference_size = len(reference)
        # Use the de-duplicated gene count so the sample size is consistent
        # with ``mapped_genes``, which is computed from ``all_genes``.
        sample_size = len(all_genes)

        for i, term in enumerate(terms):
            if slims_only and term not in self.ontology.slims_subset:
                continue
            term_annotations = self.get_annotations_by_go_id(
                term).intersection(ref_annotations)
            annotated_genes = {ann.gene_id for ann in term_annotations}
            mapped_genes = all_genes & annotated_genes
            mapped_reference_genes = reference & annotated_genes

            res[term] = (
                list(mapped_genes),
                prob.p_value(len(mapped_genes), reference_size,
                             len(mapped_reference_genes), sample_size),
                len(mapped_reference_genes),
            )

            if progress_callback and i in milestones:
                progress_callback(100.0 * i / len(terms))

        if use_fdr:
            # FDR is applied to the p-values in ascending order; the
            # adjusted values are paired back with their terms by position.
            ranked = sorted(res.items(), key=lambda item: item[1][1])
            adjusted = statistics.FDR([p for _, (_, p, _) in ranked])
            res = {
                term_id: (term_genes, p, ref_count)
                for (term_id, (term_genes, _, ref_count)), p in zip(
                    ranked, adjusted)
            }
        return res