def get_tokenized_training_data(self):
        tokenized_data = []
        lemmatized_data = []
        print("Started tokenized data ...")
        logging.debug(self.TRAINING_DATA_ROOT_DIRECTORY)
        top_directory = '/home/agniv/Desktop/data-science/telegraph_scraper/'
        top_directory = top_directory + self.TRAINING_DATA_ROOT_DIRECTORY
        with working_directory(top_directory):
            datewise_directories = sorted(os.listdir(top_directory))
            for datewise_directory in datewise_directories:
                datewise_directory = top_directory + datewise_directory
                #logging.debug('DATE: '+datewise_directory)
                with working_directory(datewise_directory):
                    pagewise_directories = sorted(
                        os.listdir(datewise_directory))
                    for pagewise_directory in pagewise_directories:
                        pagewise_directory = datewise_directory + '/' + pagewise_directory
                        #logging.debug('PAGE: '+pagewise_directory)
                        with working_directory(pagewise_directory):
                            newsfiles = sorted(os.listdir(pagewise_directory))
                            for newsfile in newsfiles:
                                #logging.debug('HEADING: '+newsfile)
                                with open(newsfile, "r") as content_file:
                                    file_content = content_file.read()
                                    text = re.sub(r'\W+', ' ', file_content)
                                    all_words = self.tokenize(text, stop_words)
                                    words = []
                                    for word in all_words:
                                        if len(word) < 3:
                                            # logging.debug("Small words not added: " + word)
                                            continue
                                        if re.match(r'^[0-9]', word):
                                            # logging.debug("Words starting with number not added: " + word)
                                            continue
                                        words.append(word)
                                    #for word in words:
                                    tokenized_data.append(words)

        #logging.debug(tokenized_data)
        #logging.debug(len(tokenized_data))
        #lemmatized_data = self.lemmatize(tokenized_data, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
        #logging.debug(lemmatized_data)
        #logging.debug(len(lemmatized_data))
        with open('tokenized_data.pkl', 'wb') as f:
            pickle.dump(tokenized_data, f)
        return 'FROM TRAINING DATA'
Example #2
def repack_epub():
    # Assumes pwd is *not* unpack directory.
    msg(f'.. Packing new {epub}.')
    with ZipFile(epub, 'w') as z:
        with working_directory(temp_dir):
            for f in os.listdir('.'):
                if f in ['..', '.']:
                    continue
                zip_add(z, f, f)
Example #3
def test_eglob():
    with TemporaryDirectory() as path:
        for d in ('one', 'two', 'three', 'four/five', 'six/seven/eight'):
            os.makedirs(os.path.join(path, fix_path(d)))
        for f in ('one/foo.py', 'one/foo.txt', 'two/bar.c', 'four/test.py',
                  'four/test2.py', 'four/me.txt', 'four/five/x.py',
                  'six/seven/test.py'):
            with open(os.path.join(path, fix_path(f)), 'w'):
                pass

        from grizzled.os import working_directory
        with working_directory(path):
            expected = {
                'one/foo.py', 'four/test.py', 'four/test2.py',
                'four/five/x.py', 'six/seven/test.py'
            }
            res = set(eglob('**/*.py'))
            assert (res == expected)
Example #5
def list_recursively(dir: str, *,
                     include_files: bool = True,
                     include_dirs: bool = True) -> Generator[str, None, None]:
    """
    Recursively list the contents of a directory. Yields the contents of
    the directory and all subdirectories. This method returns a generator,
    so it evaluates its recursive walk lazily. This function is just a
    simple wrapper around `os.walk`.

    Each yielded value is a partial path, relative to the original directory.

    **Parameters**

    - `dir` (`str`): Path to directory to list
    - `include_files` (`bool`): Whether or not to yield files. `True`
      by default.
    - `include_dirs` (`bool`): Whether or not to yield directories. `True` by
      default.

    **Yields**

    partial paths of all directories and/or files below the specified directory

    **Raises**

    `ValueError`: If `dir` does not exist, or if `dir` exists but is not a
                  directory.
    """
    if not _os.path.isdir(dir):
        raise ValueError("{0} is not a directory.".format(dir))

    from grizzled.os import working_directory

    with working_directory(dir):
        for dirpath, dirnames, filenames in _os.walk('.'):
            if include_dirs:
                for d in dirnames:
                    yield _os.path.normpath(_os.path.join(dirpath, d))
            if include_files:
                for f in filenames:
                    yield _os.path.normpath(_os.path.join(dirpath, f))
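
The docstring above describes the API; the following is a small, hypothetical usage sketch. The 'project' directory layout is invented purely for illustration and is not part of the original snippet.

# Hypothetical usage sketch for list_recursively; the directory layout below
# is made up for illustration only.
import os

os.makedirs('project/src', exist_ok=True)
open('project/src/main.py', 'w').close()

# Yield files only, as paths relative to 'project' (e.g. 'src/main.py').
for path in list_recursively('project', include_dirs=False):
    print(path)

# Yield directories only (e.g. 'src').
for path in list_recursively('project', include_files=False):
    print(path)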
Example #6
    def executecode(self):
     def jsonmayfail(jsono):
       try:
         return json.loads(jsono)
       except ValueError:
         return jsono
     def subst(org):
       return org.replace("__VERSION__", str(self.version))
     def formatoutput(output):
       time = strftime(CLIENTTIMEFMT, gmtime())
       return {"output": output, "time": time}

     output = []
     with working_directory("jwork"):

      self.version += 1

      realclassname = subst(request.json["classname"])

      cleanup = []
      for f in glob("de/tudarmstadt/botnet/janus_yanai/*") + ["class.jar","dexed.jar","../static/dexed.jar"]:
        try:
          os.remove(f)
        except OSError as e:
          cleanup.append([e.filename, e.strerror])
      if cleanup != []: output.append({"cleanup": cleanup})

      try:
        f = open("de/tudarmstadt/botnet/janus_yanai/" + realclassname + ".java", 'w')
        f.write(subst(request.json["code"]))
        f.close()
      except OSError as e:
        output.append({"write code to file": str(e)})
        return formatoutput(output)

      compileanddex = OrderedDict()
      try:
#        for i in ["javac -cp  ~/Desktop/android-sdk-linux_x86/platforms/android-10/android.jar de/tudarmstadt/botnet/janus_yanai/*", "jar cf class.jar de", "~/Desktop/android-sdk-linux_x86/platform-tools/dx --dex --output dexed.jar class.jar"]:
        for i in ["javac -cp  ~/Downloads/android-sdk-linux/platforms/android-10/android.jar de/tudarmstadt/botnet/janus_yanai/*", "jar cf class.jar de", "~/Downloads/android-sdk-linux/platform-tools/dx --dex --output dexed.jar class.jar"]:
          o = six.u(check_output(i, stderr=subprocess.STDOUT, shell=True))
          if o != "":
            compileanddex[i] = o
      except subprocess.CalledProcessError as e:
        output.append({"compile and dex": compileanddex})
        output.append({"stdout": e.output})
        output.append({"error": str(e)})
        return formatoutput(output)
      if compileanddex != {}: output.append({"compile and dex": compileanddex})

      os.symlink(os.path.join(os.getcwd(), "dexed.jar"), "../static/dexed.jar")
      ip = self.statuses[request.json["serial"]].ip
      port = self.statuses[request.json["serial"]].port
      if ip == "127.0.0.1":
        myip = "10.0.2.2"
      else:
        myip = six.u(check_output("ifconfig")).split("\n")[1].split()[1][5:]
      st = self.sendbotcmd(ip, port, "download http://" + myip + ":" + str(cherrypy.config["server.socket_port"]) + "/static/dexed.jar")
      output.append({"downloading": jsonmayfail(st)})
      if (st == "timed out"):
        return formatoutput(output)
      st = self.sendbotcmd(ip, port, "run de.tudarmstadt.botnet.janus_yanai." + realclassname)
      output.append({"running": jsonmayfail(st)})
      return formatoutput(output)
Example #7
def fix_epub(epub, book_title, temp_dir):
    '''
    Make some adjustments to the generated tables of contents in the ePub,
    removing empty elements and removing items matching the book title.

    Parameters:

    epub:       The path to the epub file
    book_title: The book title
    temp_dir:   Temporary directory to use for unpacking
    '''
    from zipfile import ZipFile, ZIP_DEFLATED
    from xml.dom import minidom
    from grizzled.os import working_directory

    rm_rf(temp_dir, silent=True)

    def zip_add(zf, path, zippath):
        '''Swiped from zipfile module.'''
        if os.path.isfile(path):
            zf.write(path, zippath, ZIP_DEFLATED)
        elif os.path.isdir(path):
            if zippath:
                zf.write(path, zippath)
            for nm in os.listdir(path):
                zip_add(zf, os.path.join(path, nm), os.path.join(zippath, nm))

    def unpack_epub():
        # Assumes pwd is *not* unpack directory.
        msg(f'.. Unpacking {epub}.')
        with ZipFile(epub) as z:
            z.extractall(temp_dir)

    def repack_epub():
        # Assumes pwd is *not* unpack directory.
        msg(f'.. Packing new {epub}.')
        with ZipFile(epub, 'w') as z:
            with working_directory(temp_dir):
                for f in os.listdir('.'):
                    if f in ['..', '.']:
                        continue
                    zip_add(z, f, f)

    def strip_text_children(element):
        for child in element.childNodes:
            if type(child) == minidom.Text:
                element.removeChild(child)

    def get_text_children(element):
        text = None
        if element:
            s = ''
            for child in element.childNodes:
                if child and (type(child) == minidom.Text):
                    s += child.data.strip()
            text = s if s else None
        return text

    def fix_toc_ncx(toc):
        # Assumes pwd *is* unpack directory
        msg(f'.. Reading table of contents file "{toc}".')
        with open(toc) as f:
            toc_xml = f.read()

        msg('.. Adjusting table of contents.')
        with minidom.parse(toc) as dom:
            nav_map = dom.getElementsByTagName('navMap')
            if not nav_map:
                abort('Malformed table of contents: No <navMap>.')
            nav_map = nav_map[0]
            for p in nav_map.getElementsByTagName('navPoint'):
                text_nodes = p.getElementsByTagName('text')
                text = None
                if text_nodes:
                    text = get_text_children(text_nodes[0])

                if (not text) or (text == book_title):
                    nav_map.removeChild(p)

            # Renumber the nav points.
            for i, p in enumerate(nav_map.getElementsByTagName('navPoint')):
                num = i + 1
                p.setAttribute('id', f'navPoint-{num}')

            # Strip any text nodes from the navmap.
            strip_text_children(nav_map)

            # Write it out.
            with open(toc, 'w') as f:
                dom.writexml(f)

    def fix_nav_xhtml(toc):
        # Assumes pwd *is* unpack directory
        msg(f'.. Reading table of contents file "{toc}".')
        with open(toc) as f:
            toc_xml = f.read()

        msg('.. Adjusting table of contents.')
        with minidom.parse(toc) as dom:
            navs = dom.getElementsByTagName('nav')
            nav = None
            for n in navs:
                if not n.hasAttributes():
                    continue
                a = n.attributes.get('id')
                if not a:
                    continue
                if a.value == 'toc':
                    nav = n
                    break
            else:
                abort('Malformed table of contents: No TOC <nav>.')

            ol = nav.getElementsByTagName('ol')
            if (not ol) or (len(ol) == 0):
                abort('Malformed table of contents: No list in <nav>.')
            ol = ol[0]

            for li in ol.getElementsByTagName('li'):
                a = li.getElementsByTagName('a')
                if not a:
                    abort('Malformed table of contents: No <a> in <li>.')
                a = a[0]
                text = get_text_children(a)
                if (not text) or (text == book_title):
                    ol.removeChild(li)

            # Renumber the list items
            for i, li in enumerate(ol.getElementsByTagName('li')):
                num = i + 1
                li.setAttribute('id', f'toc-li-{num}')

            # Strip any text nodes from the ol.
            strip_text_children(ol)

            # Write it out.
            with open(toc, 'w') as f:
                dom.writexml(f)

    # Main logic
    try:
        unpack_epub()
        with ensure_dir(temp_dir):
            with working_directory(temp_dir):

                for toc, func in (('toc.ncx', fix_toc_ncx),
                                  ('nav.xhtml', fix_nav_xhtml)):
                    if not os.path.exists(toc):
                        msg(f'.. No {toc} file. Skipping it.')
                        continue
                    func(toc)

        repack_epub()
    finally:
        #rmtree(temp_dir)
        pass
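
A minimal, hypothetical call of fix_epub follows. The file name, book title, and scratch directory are assumptions for illustration only, and the helpers fix_epub relies on (msg, abort, rm_rf, ensure_dir) must already be available in the surrounding module.

# Hypothetical usage of fix_epub; paths and title are invented for the sketch.
import os
import tempfile

book = os.path.abspath('output/my-book.epub')
scratch = tempfile.mkdtemp()
fix_epub(epub=book,
         book_title='My Book',
         temp_dir=os.path.join(scratch, 'epub-unpack'))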
Example #8
def piconfig(tag, filename):
    # Go into the tag's directory and feed the argument file to piconfig.
    with working_directory(str(os.path.join(os.getcwd(), tag))):
        os.system('piconfig < ' + filename + '.txt')
Example #9
    def get_tokenized_test_data(self, id2word):
        logging.debug('STARTED GET_TOKENIZED_TEST_DATA')
        temp_file = datapath("saved-model")
        saved_lda_model = models.LdaModel.load(temp_file)
        tokenized_data = []
        #lemmatized_data = []
        print("Started tokenized data ...")
        logging.debug(self.TEST_DATA_ROOT_DIRECTORY)
        top_directory = '/home/agniv/Desktop/data-science/telegraph_scraper/'
        top_directory = top_directory + self.TEST_DATA_ROOT_DIRECTORY
        with working_directory(top_directory):
            datewise_directories = sorted(os.listdir(top_directory))
            for datewise_directory in datewise_directories:
                datewise_directory = top_directory + datewise_directory
                #logging.debug('DATE: '+datewise_directory)
                with working_directory(datewise_directory):
                    pagewise_directories = sorted(
                        os.listdir(datewise_directory))
                    for pagewise_directory in pagewise_directories:
                        pagewise_directory = datewise_directory + '/' + pagewise_directory
                        #logging.debug('PAGE: '+pagewise_directory)
                        with working_directory(pagewise_directory):
                            newsfiles = sorted(os.listdir(pagewise_directory))
                            for newsfile in newsfiles:
                                #logging.debug('HEADING: '+newsfile)
                                with open(newsfile, "r") as content_file:
                                    file_content = content_file.read()
                                    text = re.sub(r'\W+', ' ', file_content)
                                    all_words = self.tokenize(text, stop_words)
                                    words = []
                                    for word in all_words:
                                        if len(word) < 3:
                                            # logging.debug("Small words not added: " + word)
                                            continue
                                        if re.match(r'^[0-9]', word):
                                            # logging.debug("Words starting with number not added: " + word)
                                            continue
                                        words.append(word)
                                        #logging.debug(words)
                                        #print(words)
                                    bow = id2word.doc2bow(words)
                                    sorted_topic_list = sorted(
                                        saved_lda_model[bow],
                                        key=lambda x: x[1],
                                        reverse=True)
                                    top_topic = sorted_topic_list[:1]
                                    (idx, value) = top_topic[0]
                                    top_topic_str = str(
                                        saved_lda_model.print_topic(idx, 5))
                                    top_topic_keywords = re.findall(
                                        r'"([^"]*)"', top_topic_str)
                                    top_topic_probabilities = re.findall(
                                        r"\d+\.\d+", top_topic_str)
                                    logging.debug('FILENAME: ' +
                                                  pagewise_directory + '/' +
                                                  newsfile)
                                    print('FILENAME: ' + pagewise_directory +
                                          '/' + newsfile)
                                    logging.debug('TOPICS: ' +
                                                  str(top_topic_keywords))
                                    print('TOPICS: ' + str(top_topic_keywords))
                                    logging.debug('TOPIC PROBABILITIES: ' +
                                                  str(top_topic_probabilities))
                                    print('TOPIC PROBABILITIES: ' +
                                          str(top_topic_probabilities))
                                    #for word in words:
                                    #tokenized_data.append(words)

        #logging.debug(tokenized_data)
        #logging.debug(len(tokenized_data))
        #lemmatized_data = self.lemmatize(tokenized_data, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
        #logging.debug(lemmatized_data)
        #logging.debug(len(lemmatized_data))
        #with open('tokenized_data.pkl', 'wb') as f:
        #   pickle.dump(tokenized_data, f)
        return 'FROM TEST DATA'
Example #10
from grizzled.os import working_directory
with working_directory('..'):
    from critter import Critter
    from food import Food
    from world import World
    from biology import BioAssumptions


class ScaryPredator(Critter):
    DESCRIPTION = "A critter that can eat other critters"
    pass
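
Every example above leans on grizzled.os.working_directory. For readers without the library, a minimal stand-in with the same save/chdir/restore behavior can be sketched with contextlib; this is an assumption-level sketch, not the grizzled implementation.

# Minimal sketch of a working_directory-style context manager (not the
# grizzled.os implementation).
import os
from contextlib import contextmanager

@contextmanager
def working_directory(path):
    # Remember where we were, switch to `path`, and always switch back,
    # even if the body of the `with` block raises.
    previous = os.getcwd()
    os.chdir(path)
    try:
        yield path
    finally:
        os.chdir(previous)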