Beispiel #1
0
def enumerate_inspect_source_code(
        folder,
        file_pattern=".*[.]((py)|(ipynb))$",
        neg_pattern=".*(([-]checkpoint)|(_todo)|(_temp)).*",
        line_patterns="from sklearn[_0-9a-zA-Z.]* import ([_a-zA-Z0-9]+);;import sklearn[.]([_a-z]+)",
        fullname=False):
    """
    Counts groups extracted from source files. Every selected file is
    assumed to be a text file encoded in :epkg:`utf-8` character set.

    @param      folder          folder to dig into
    @param      file_pattern    files to consider
    @param      neg_pattern     negative patterns for filenames
    @param      line_patterns   patterns to look into, separated by ``;;``
    @param      fullname        if True, include the subfolder while checking the regex
    @return                     list of dictionaries
    """
    compiled = [re.compile(pat) for pat in line_patterns.split(';;')]
    seen = 0
    for filename in explore_folder_iterfile(folder,
                                            pattern=file_pattern,
                                            neg_pattern=neg_pattern,
                                            fullname=fullname):
        seen += 1
        try:
            with open(filename, "r", encoding="utf-8", errors='ignore') as fh:
                for line_index, content in enumerate(fh):
                    for pattern_index, pattern in enumerate(compiled):
                        match = pattern.search(content)
                        if match is None:
                            continue
                        # one observation per captured group on the line
                        for grp in match.groups():
                            yield dict(group=grp, name=filename,
                                       line=line_index, patid=pattern_index)
        except UnicodeDecodeError as e:
            raise FileNotFoundError(
                "Unable to process '{0}' due to '{1}'.".format(filename, e)) from e
    if seen == 0:
        # nothing matched: build a diagnostic listing of what the folder holds
        listed = os.listdir(folder)
        founds = "\n".join(listed) if listed else "EMPTY"
        positives = list(
            explore_folder_iterfile(folder,
                                    pattern=file_pattern,
                                    fullname=fullname))
        pos_founds = "\n".join(positives) if positives else "EMPTY"
        mes = "No file found in folder '{0}' with pattern '{1}' (neg='{2}')\n--IN--\n{3}\n--IN--\n{4}"
        raise FileNotFoundError(
            mes.format(folder, file_pattern, neg_pattern, founds, pos_founds))
    def test_pyensae_links(self):
        """
        Checks that every notebook importing ``download_data`` references
        at least one of the known data URLs.
        """
        fLOG(__file__,
             self._testMethodName,
             OutputPrint=__name__ == "__main__")

        this = os.path.join(os.path.dirname(__file__), '..', '..', '_doc',
                            'notebooks')
        checked = 0
        missed = []
        # URL fragments a data-downloading notebook is expected to contain
        tolook = [
            'http://files.grouplens.org/datasets/movielens/ml-1m.zip',
            'http://www.xavierdupre.fr/', 'url=\\"http',
            '\\"Skin_NonSkin.txt\\", website=\\"https://archive.ics',
            "website='http://telechargement.insee.fr/fichiersdetail",
            'https://archive.ics.uci.edu/ml/machine-learning-databases'
        ]
        for note in explore_folder_iterfile(this,
                                            ".*[.]ipynb$",
                                            ".ipynb_checkpoints",
                                            fullname=True):
            with open(note, 'r', encoding='utf-8') as f:
                content = f.read()
            if "from pyensae.datasource import download_data" in content:
                checked += 1
                if not any(to in content for to in tolook):
                    missed.append(note)
        self.assertGreater(checked, 1)
        # BUG FIX: ``missed`` holds the notebooks that reference none of the
        # expected URLs, so the test must assert that list is EMPTY; the
        # previous ``assertNotEmpty`` inverted the check (the duplicate copy
        # of this test elsewhere in the file already uses ``assertEmpty``).
        self.assertEmpty(missed)
    def test_convert_notebooks(self):
        """Upgrades every notebook under the documentation and unit-test trees."""
        here = os.path.abspath(os.path.dirname(__file__))
        doc_folder = os.path.normpath(
            os.path.join(here, "..", "..", "_doc", "notebooks"))
        for notebook in explore_folder_iterfile(doc_folder, pattern=".*[.]ipynb"):
            if upgrade_notebook(notebook):
                fLOG("modified", notebook)
            # remove numbers
            remove_execution_number(notebook, notebook)

        ut_folder = os.path.normpath(
            os.path.join(here, "..", "..", "_unittests"))
        for notebook in explore_folder_iterfile(ut_folder, pattern=".*[.]ipynb"):
            if upgrade_notebook(notebook):
                fLOG("modified", notebook)
    def test_pyensae_links(self):
        """Verifies that notebooks downloading data reference a known URL."""
        fLOG(
            __file__,
            self._testMethodName,
            OutputPrint=__name__ == "__main__")

        notebooks_dir = os.path.join(os.path.dirname(__file__),
                                     '..', '..', '_doc', 'notebooks')
        checked = 0
        missed = []
        # URL fragments a data-downloading notebook is expected to contain
        tolook = ['http://files.grouplens.org/datasets/movielens/ml-1m.zip',
                  'http://www.xavierdupre.fr/',
                  'url=\\"http',
                  '\\"Skin_NonSkin.txt\\", website=\\"https://archive.ics',
                  "website='http://telechargement.insee.fr/fichiersdetail",
                  'https://archive.ics.uci.edu/ml/machine-learning-databases']
        for note in explore_folder_iterfile(notebooks_dir, ".*[.]ipynb$",
                                            ".ipynb_checkpoints", fullname=True):
            with open(note, 'r', encoding='utf-8') as f:
                content = f.read()
            # only inspect notebooks that actually download data
            if ("datasource import download_data" not in content
                    and "pyensae.download_data(" not in content):
                continue
            checked += 1
            if not any(fragment in content for fragment in tolook):
                missed.append(note)
        self.assertGreater(checked, 1)
        self.assertEmpty(missed)
 def test_convert_notebooks(self):
     """
     Upgrades every notebook found under ``_doc/notebooks`` and
     ``_unittests`` to the latest notebook format, logging the ones
     that were modified.
     """
     fLOG(
         __file__,
         self._testMethodName,
         OutputPrint=__name__ == "__main__")
     fold = os.path.abspath(os.path.dirname(__file__))
     fold2 = os.path.normpath(
         os.path.join(fold, "..", "..", "_doc", "notebooks"))
     for nbf in explore_folder_iterfile(fold2, pattern=".*[.]ipynb"):
         # upgrade_notebook returns a truthy value when the file changed
         t = upgrade_notebook(nbf)
         if t:
             fLOG("modified", nbf)
     # second pass over the unit-test notebooks
     fold2 = os.path.normpath(os.path.join(fold, "..", "..", "_unittests"))
     for nbf in explore_folder_iterfile(fold2, pattern=".*[.]ipynb"):
         t = upgrade_notebook(nbf)
         if t:
             fLOG("modified", nbf)
Beispiel #6
0
    def test_convert_notebooks(self):
        """Normalizes notebooks in the documentation and unit-test folders."""
        fLOG(__file__,
             self._testMethodName,
             OutputPrint=__name__ == "__main__")
        base = os.path.abspath(os.path.dirname(__file__))
        doc_nbs = os.path.normpath(
            os.path.join(base, "..", "..", "_doc", "notebooks"))
        for nb_file in explore_folder_iterfile(doc_nbs, pattern=".*[.]ipynb"):
            if upgrade_notebook(nb_file):
                fLOG("modified", nb_file)
            # remove numbers
            remove_execution_number(nb_file, nb_file)

        ut_nbs = os.path.normpath(os.path.join(base, "..", "..", "_unittests"))
        for nb_file in explore_folder_iterfile(ut_nbs, pattern=".*[.]ipynb"):
            if upgrade_notebook(nb_file):
                fLOG("modified", nb_file)
def enumerate_inspect_source_code(folder, file_pattern=".*[.]((py)|(ipynb))$",
                                  neg_pattern=".*(([-]checkpoint)|(_todo)|(_temp)).*",
                                  line_patterns="from sklearn[_0-9a-zA-Z.]* import ([_a-zA-Z0-9]+);;import sklearn[.]([_a-z]+)",
                                  fullname=False):
    """
    Counts groups extracted from source file. We assume all selected files
    can be opened as text files encoded in :epkg:`utf-8` character set.

    @param      folder          folder to dig into
    @param      file_pattern    files to consider
    @param      neg_pattern     negative patterns for filenames
    @param      line_patterns   patterns to look into, separated by ``;;``
    @param      fullname        if True, include the subfolder while checking the regex
    @return                     list of dictionaries
    """
    regs = [re.compile(reg) for reg in line_patterns.split(';;')]
    nb = 0
    for name in explore_folder_iterfile(folder, pattern=file_pattern,
                                        neg_pattern=neg_pattern, fullname=fullname):
        nb += 1
        try:
            with open(name, "r", encoding="utf-8", errors='ignore') as f:
                for li, line in enumerate(f):
                    for pi, reg in enumerate(regs):
                        r = reg.search(line)
                        if r:
                            # one observation per captured group on the line
                            for g in r.groups():
                                obs = dict(group=g, name=name, line=li)
                                obs['patid'] = pi
                                yield obs
        except UnicodeDecodeError as e:
            # BUG FIX: chain the original exception (``from e``) so the
            # decoding failure stays visible, matching the other copy of
            # this function in the file.
            raise FileNotFoundError(
                "Unable to process '{0}' due to '{1}'.".format(name, e)) from e
    if nb == 0:
        # nothing matched: build a diagnostic listing of the folder content
        found = os.listdir(folder)
        founds = "\n".join(found) if found else "EMPTY"
        pos_found = list(explore_folder_iterfile(
            folder, pattern=file_pattern, fullname=fullname))
        pos_founds = "\n".join(pos_found) if pos_found else "EMPTY"
        mes = "No file found in folder '{0}' with pattern '{1}' (neg='{2}')\n--IN--\n{3}\n--IN--\n{4}"
        raise FileNotFoundError(mes.format(
            folder, file_pattern, neg_pattern, founds, pos_founds))
Beispiel #8
0
    def enumerate_group_files(self, group):
        """
        Enumerates all files in a group.

        @param      group       group
        @return                 iterator on files
        """
        if group is None:
            # no specific group: walk every known group recursively
            for sub_group in self.Groups:
                yield from self.enumerate_group_files(sub_group)
        else:
            location = self.get_group_location(group)
            yield from explore_folder_iterfile(location)
    def enumerate_group_files(self, group):
        """
        Enumerates all files in a group.

        @param      group       group name, or None to iterate over every
                                group listed in ``self.Groups``
        @return                 iterator on files
        """
        if group is None:
            # recurse once per known group
            for g in self.Groups:
                for _ in self.enumerate_group_files(g):
                    yield _
        else:
            # resolve the group folder and yield every file it contains
            loc = self.get_group_location(group)
            for _ in explore_folder_iterfile(loc):
                yield _
def execute_python_scripts(root,
                           df,
                           col_names=None,
                           url=None,
                           eol="/",
                           fLOG=noLOG,
                           gen_mail=None):
    """
    Retrieves all :epkg:`python` scripts and run them.

    @param      root            main folder
    @param      df              dataframe
    @param      col_names       dictionary for columns:
                                folder, mail, program, out, err, url, cmp, url_content, key, time
    @param      url             url pattern formatted with a mail id, or None
    @param      eol             if not None, replaces end of lines by *eof*
    @param      gen_mail        generator of mails
    @param      fLOG            logging function
    @return                     dataframe
    """
    # BUG FIX: the default ``col_names=None`` crashed on the first
    # ``col_names.get``; fall back to an empty mapping so the default
    # column names are used.
    if col_names is None:
        col_names = {}
    if gen_mail is None:

        def iter_mail(mail):
            # default generator: the mail as given and its lowercase form
            yield mail
            yield mail.lower()

        gen_mail = iter_mail

    def post_process(out, eol):
        # normalize whitespace and optionally flatten line endings
        out = out.strip("\r\t\n").rstrip().replace("\r",
                                                   "").replace("\t", "    ")
        if eol:
            out = out.replace("\n", eol)
        return out

    downloads = {}
    res = []
    for name, mail in zip(df[col_names.get("folder", "folder")],
                          df[col_names.get("mail", "mail")]):
        row = {col_names.get("folder", "folder"): name}
        fLOG("[execute_python_script], look into '{0}'".format(name))
        subf = os.path.join(root, name)
        col_find = col_names.get("exists", "exists")
        if not os.path.exists(subf):
            # fallback: some folders use '.' instead of '-'
            subf = os.path.join(root, name.replace("-", "."))
        if not os.path.exists(subf):
            row[col_find] = False
            res.append(row)
        else:
            row[col_find] = True
            store = []
            for py in explore_folder_iterfile(subf, ".*[.]py$"):
                store.append(py)
            fLOG("     -", len(store), "programs found")

            col_out = col_names.get("out", "out")
            col_err = col_names.get("err", "err")
            col_prog = col_names.get("program", "program")
            col_time = col_names.get("time", "time")
            col_key = col_names.get("key", "key")
            col_size = col_names.get("size", "size")
            col_url = col_names.get("url", "url")
            col_ind = col_names.get("pattern_id", "pattern_id")

            if len(store) == 0:
                # no script: still record one row per generated mail
                for mm in sorted(gen_mail(mail.strip())):
                    mailid = _get_code(mm.encode("utf-8"))
                    r = row.copy()
                    loc = url.format(mailid)
                    ind = {col_key: mm, col_ind: mailid, col_url: loc}
                    r.update(ind)
                    res.append(r)
                continue

            # test all programs
            outs = []
            for py in sorted(store):
                cmd = '"{0}" "{1}"'.format(sys.executable, py)
                # BUG FIX: ``time.clock`` was removed in Python 3.8;
                # ``time.perf_counter`` is its documented replacement.
                t1 = time.perf_counter()
                try:
                    out, err = run_cmd(cmd, wait=True)
                except Exception as e:
                    out = None
                    err = str(e)
                # BUG FIX: post-processing None (failed run) raised an
                # AttributeError; keep None so the failure stays detectable.
                if out is not None:
                    out = post_process(out, eol)
                t2 = time.perf_counter()
                outs.append({
                    col_out: out,
                    col_err: post_process(err, eol),
                    col_prog: os.path.split(py)[-1],
                    col_time: t2 - t1,
                    col_size: os.stat(py).st_size
                })

            if url is None:
                for o in outs:
                    r = row.copy()
                    r.update(o)
                    res.append(r)
            else:
                col_cmp = col_names.get("cmp", "cmp")
                col_in = col_names.get("sortie_dans_motif",
                                       "sortie_dans_motif")
                col_in2 = col_names.get("motif_dans_sortie",
                                        "motif_dans_sortie")
                col_dist = col_names.get("dist", "dist")
                col_content = col_names.get("content", "content")

                if out is None:
                    # last script failed: record outputs without fetching urls
                    # BUG FIX: the default ``gen_mail`` yields plain strings,
                    # so the former ``for _, mm in ...`` failed to unpack.
                    for mm in gen_mail(mail.strip()):
                        mailid = _get_code(mm.encode("utf-8"))
                        ind = {col_ind: mailid}
                        for o in outs:
                            r = row.copy()
                            r.update(o)
                            r.update(ind)
                            res.append(r)
                else:
                    for mm in sorted(gen_mail(mail.strip())):
                        mailid = _get_code(mm.encode("utf-8"))
                        loc = url.format(mailid)
                        ind = {col_key: mm, col_ind: mailid, col_url: loc}

                        # cache downloaded content per url
                        if loc not in downloads:
                            downloads[loc] = get_url_content_timeout(
                                loc).strip("\n\r\t ")
                        content = post_process(downloads[loc], eol)
                        ind[col_content] = content

                        for o in outs:
                            r = row.copy()
                            r.update(o)
                            r.update(ind)
                            out = r[col_out]
                            # exact or whitespace-insensitive comparison
                            r[col_cmp] = out == content or out.strip(
                            ) == content.strip()
                            r[col_in] = out in content
                            r[col_in2] = content in out
                            r[col_dist] = (edit_distance(out, content)[0]) if (
                                len(content) > len(out) //
                                2) else abs(len(content) - len(out))
                            res.append(r)
    return pandas.DataFrame(res)
from pyquickhelper.loghelper import fLOG  # publish_lectures
fLOG(OutputPrint=True)

#########################################
# import of the functions we need
from pyquickhelper.filehelper import synchronize_folder, explore_folder_iterfile

########################################
# retrieval of the folders compiled through a jenkins server
# NOTE(review): ``root`` must already be defined earlier in the script.
fLOG("Digging into ", root)

# relative path identifying the entry page of a built documentation
sub = os.path.join("_doc", "sphinxdoc", "build", "html", "index.html")
index = []
pattern = "^index.html$"
done = {}
for name in explore_folder_iterfile(root, pattern):
    if name.endswith(sub):
        # drop the trailing "/<sub>" suffix to obtain the package folder
        pack = name[:len(name) - len(sub) - 1]
        parent, spl = os.path.split(pack)
        if "_UT_" in spl:
            # unit-test builds live one level deeper
            parent, spl = os.path.split(parent)
        if "_UT_" in spl:
            raise ValueError("Something is weird with: '{0}'".format(name))
        index.append((spl, os.path.dirname(name)))
        # NOTE(review): ``done`` is never populated, so this duplicate check
        # can never trigger — presumably ``done[spl] = name`` is missing.
        if spl in done:
            raise ValueError("Duplicated package '{0}'.\n{1}".format(
                spl, "\n".join("{0}={1}".format(k, v)
                               for k, v in sorted(done.items()))))

fLOG("Found {0} directories".format(len(index)))
for ind in index:
from pyquickhelper.loghelper import fLOG  # publish_lectures
fLOG(OutputPrint=True)

#########################################
# import of the functions we need
from pyquickhelper.filehelper import synchronize_folder, explore_folder_iterfile

########################################
# retrieval of the folders compiled through a jenkins server
# NOTE(review): ``root`` must already be defined at this point of the script.
fLOG("Digging into ", root)

# relative path identifying the entry page of a built documentation
sub = os.path.join("_doc", "sphinxdoc", "build", "html", "index.html")
index = []
pattern = "^index.html$"
done = {}
for name in explore_folder_iterfile(root, pattern):
    if name.endswith(sub):
        # drop the trailing "/<sub>" suffix to obtain the package folder
        pack = name[:len(name) - len(sub) - 1]
        parent, spl = os.path.split(pack)
        if "_UT_" in spl:
            # unit-test builds live one level deeper
            parent, spl = os.path.split(parent)
        if "_UT_" in spl:
            raise ValueError("Something is weird with: '{0}'".format(name))
        index.append((spl, os.path.dirname(name)))
        # NOTE(review): ``done`` is never filled, so this duplicate check can
        # never fire — presumably ``done[spl] = name`` is missing.
        if spl in done:
            raise ValueError("Duplicated package '{0}'.\n{1}".format(
                spl, "\n".join("{0}={1}".format(k, v) for k, v in sorted(done.items()))))

fLOG("Found {0} directories".format(len(index)))
for ind in index:
    fLOG("  ", ind)
Beispiel #13
0
def execute_python_scripts(root, df, col_names=None, url=None, eol="/", fLOG=noLOG, gen_mail=None):
    """
    Retrieves all python scripts and runs them.

    @param      root            main folder
    @param      df              dataframe
    @param      col_names       dictionary for columns:
                                folder, mail, program, out, err, url, cmp, url_content, key, time
    @param      url             url pattern formatted with a mail id, or None
    @param      eol             if not None, replaces end of lines by *eof*
    @param      gen_mail        generator of mails
    @param      fLOG            logging function
    @return                     dataframe
    """
    # BUG FIX: the default ``col_names=None`` crashed on the first
    # ``col_names.get``; fall back to an empty mapping so the default
    # column names are used.
    if col_names is None:
        col_names = {}
    if gen_mail is None:
        def iter_mail(mail):
            # default generator: the mail as given and its lowercase form
            yield mail
            yield mail.lower()
        gen_mail = iter_mail

    def post_process(out, eol):
        # normalize whitespace and optionally flatten line endings
        out = out.strip("\r\t\n").rstrip().replace(
            "\r", "").replace("\t", "    ")
        if eol:
            out = out.replace("\n", eol)
        return out

    downloads = {}
    res = []
    for name, mail in zip(df[col_names.get("folder", "folder")], df[col_names.get("mail", "mail")]):
        row = {col_names.get("folder", "folder"): name}
        fLOG("[execute_python_script], look into '{0}'".format(name))
        subf = os.path.join(root, name)
        col_find = col_names.get("exists", "exists")
        if not os.path.exists(subf):
            # fallback: some folders use '.' instead of '-'
            subf = os.path.join(root, name.replace("-", "."))
        if not os.path.exists(subf):
            row[col_find] = False
            res.append(row)
        else:
            row[col_find] = True
            store = []
            for py in explore_folder_iterfile(subf, ".*[.]py$"):
                store.append(py)
            fLOG("     -", len(store), "programs found")

            col_out = col_names.get("out", "out")
            col_err = col_names.get("err", "err")
            col_prog = col_names.get("program", "program")
            col_time = col_names.get("time", "time")
            col_key = col_names.get("key", "key")
            col_size = col_names.get("size", "size")
            col_url = col_names.get("url", "url")
            col_ind = col_names.get("pattern_id", "pattern_id")

            if len(store) == 0:
                # no script: still record one row per generated mail
                for mm in sorted(gen_mail(mail.strip())):
                    mailid = _get_code(mm.encode("utf-8"))
                    r = row.copy()
                    loc = url.format(mailid)
                    ind = {col_key: mm, col_ind: mailid, col_url: loc}
                    r.update(ind)
                    res.append(r)
                continue

            # test all programs
            outs = []
            for py in sorted(store):
                cmd = '"{0}" "{1}"'.format(sys.executable, py)
                # BUG FIX: ``time.clock`` was removed in Python 3.8;
                # ``time.perf_counter`` is its documented replacement.
                t1 = time.perf_counter()
                try:
                    out, err = run_cmd(cmd, wait=True)
                except Exception as e:
                    out = None
                    err = str(e)
                # BUG FIX: post-processing None (failed run) raised an
                # AttributeError; keep None so the failure stays detectable.
                if out is not None:
                    out = post_process(out, eol)
                t2 = time.perf_counter()
                outs.append({col_out: out, col_err: post_process(err, eol),
                             col_prog: os.path.split(py)[-1], col_time: t2 - t1,
                             col_size: os.stat(py).st_size})

            if url is None:
                for o in outs:
                    r = row.copy()
                    r.update(o)
                    res.append(r)
            else:
                col_cmp = col_names.get("cmp", "cmp")
                col_in = col_names.get(
                    "sortie_dans_motif", "sortie_dans_motif")
                col_in2 = col_names.get(
                    "motif_dans_sortie", "motif_dans_sortie")
                col_dist = col_names.get("dist", "dist")
                col_content = col_names.get("content", "content")

                if out is None:
                    # last script failed: record outputs without fetching urls
                    # BUG FIX: the default ``gen_mail`` yields plain strings,
                    # so the former ``for ii, mm in ...`` failed to unpack.
                    for mm in gen_mail(mail.strip()):
                        mailid = _get_code(mm.encode("utf-8"))
                        ind = {col_ind: mailid}
                        for o in outs:
                            r = row.copy()
                            r.update(o)
                            r.update(ind)
                            res.append(r)
                else:
                    for mm in sorted(gen_mail(mail.strip())):
                        mailid = _get_code(mm.encode("utf-8"))
                        loc = url.format(mailid)
                        ind = {col_key: mm, col_ind: mailid, col_url: loc}

                        # cache downloaded content per url
                        if loc not in downloads:
                            downloads[loc] = get_url_content_timeout(
                                loc).strip("\n\r\t ")
                        content = post_process(downloads[loc], eol)
                        ind[col_content] = content

                        for o in outs:
                            r = row.copy()
                            r.update(o)
                            r.update(ind)
                            out = r[col_out]
                            # exact or whitespace-insensitive comparison
                            r[col_cmp] = out == content or out.strip(
                            ) == content.strip()
                            r[col_in] = out in content
                            r[col_in2] = content in out
                            r[col_dist] = (edit_distance(out, content)[0]) if (
                                len(content) > len(out) // 2) else abs(len(content) - len(out))
                            res.append(r)
    return pandas.DataFrame(res)
Beispiel #14
0
                outs.append(out)
                print('[csharpy.dotnet] OUT')
                print(out)

        # Copy files.
        from pyquickhelper.filehelper import explore_folder_iterfile
        dest = os.path.join('csharpy', 'binaries', version2)
        if not os.path.exists(dest):
            os.makedirs(dest)
        init = os.path.join(dest, "__init__.py")
        if not os.path.exists(init):
            with open(init, 'w') as f:
                pass
        must_copy = {'DynamicCS': 0, 'CSharPyExtension': 0}
        copied = 0
        for name in explore_folder_iterfile(folder,
                                            pattern='.*[.]((dll)|(so))$'):
            full = os.path.join(folder, name)
            if version2 in full:
                short_name = os.path.split(os.path.splitext(name)[0])[-1]
                if short_name in must_copy:
                    must_copy[short_name] += 1
                copied += 1
                print("[csharpy.copy] '{0}'".format(name))
                shutil.copy(name, dest)
            else:
                # print("[csharpy.skip] '{0}'".format(name))
                pass
        min_must_copy = min(must_copy.values())
        if copied == 0 or min_must_copy == 0:
            raise RuntimeError(
                "Missing binaries in '{0}' for version='{1}'".format(