def perform_coverage_delete(self, files):
    # replace the covered lines in each file with a no-op pragma line
    cleaner = Cleaner('pass # pragma: no cover\n')
    for filename, lines in files.items():
        content = self.read(filename)
        content = cleaner.clean(content, lines)
        self.write(filename, content)
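The snippet above shows only the call sites. Below is a minimal sketch of the Cleaner interface it implies; the class body is an assumption for illustration, not the original implementation, and it treats the entries of files[filename] as 1-based line numbers.

# Hypothetical sketch of the implied Cleaner interface.
class Cleaner:
    def __init__(self, replacement):
        self.replacement = replacement

    def clean(self, content, lines):
        # replace each listed (1-based) line with the replacement line,
        # preserving the original indentation
        out = content.splitlines(keepends=True)
        for lineno in lines:
            original = out[lineno - 1]
            indent = original[:len(original) - len(original.lstrip())]
            out[lineno - 1] = indent + self.replacement
        return "".join(out)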
Example #2
    def __init__(self):
        self.cleaner = Cleaner()
        self.maxlen = 900
        self.tokenizer = None
        self.parent_path = Path(__file__).parent.parent

        with open(
                self.parent_path /
                'data/neural_network_config/tokenizer.pickle', 'rb') as handle:
            self.tokenizer = pickle.load(handle)

        # load the model architecture from JSON
        with open(self.parent_path /
                  'data/neural_network_config/model.json', 'r') as json_model_keras:
            loaded_model_json = json_model_keras.read()
        self.loaded_model = model_from_json(loaded_model_json)

        # load weights into new model
        self.loaded_model.load_weights(self.parent_path /
                                       "data/neural_network_config/model.h5")

        # compile the loaded model so it can be used for prediction
        self.loaded_model.compile(optimizer='adam',
                                  loss='mean_squared_error',
                                  metrics=['mae', 'accuracy'])
class Tips(object):
    """ Manage Tips Events. """

    def __init__(self, enable):
        self.enable = enable
        self._tips = {}
        self._new_tips = set()
        self.lock = Lock()
        if self.enable:
            self.fetcher = Fetcher(self._tips, self.lock, self._new_tips)
            self.cleaner = Cleaner(self._tips, self.lock, self._new_tips)
            self.fetcher.start()
            self.cleaner.start()

    def tips(self):
        return self._tips.values()

    def new_tips(self):
        if self._new_tips:
            wait_free_acquire(self.lock)
            res = [self._tips[x] for x in self._new_tips]
            self._new_tips.clear()
            self.lock.release()
            return res
        else:
            return []

    def stop(self):
        if self.enable:
            self.fetcher.finnish()
            self.cleaner.finnish()
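A hedged usage sketch for the class above, assuming Fetcher, Cleaner, and wait_free_acquire come from the same codebase:

import time

# Hypothetical consumer loop: Tips spawns its own fetcher/cleaner threads,
# so the caller only polls for newly arrived tips.
tips = Tips(enable=True)
try:
    while True:
        for tip in tips.new_tips():  # drain tips fetched since the last poll
            print(tip)
        time.sleep(1)
finally:
    tips.stop()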
Example #4
def __init__(self):
    self.lda_model = None
    self.dictionary = None
    self.n_topic = None
    self.n_passes = None
    self.cleaner = Cleaner()
    self.folder_name = "models"
    def play(self):
        '''Fetch the player's hand from the server and display the cards.'''

        Cleaner.clean_wait_begin(self, self.check_begin)
        self.init_slot_descriptions()
        # change the background picture
        self.background_wait_begin = Picture(self.app,
                                             image=PATH_RESSOURCES +
                                             "background_play.jpg",
                                             grid=[0, 0, 40, 40])
        response = requests.get(BASE_URL + '/play/round/1/hand/' +
                                self.token).text
        datas = json.loads(response)
        self.cards = []
        idx_pos = 1
        for data in datas:
            self.cards.append(
                PushButton(self.app,
                           image=PATH_RESSOURCES + "cards_min/card_" +
                           data["image"] + "_min.png",
                           command=self.show_card,
                           args=[data],
                           grid=[idx_pos, 35]))
            idx_pos += 1
Example #6
def main(endpoint_models: list):
    """ Loop over endpoint models, search file directory for endpoints
        and compare to a standard endpoint config.
    """
    audit_list = list()
    for model in tqdm(endpoint_models, desc="Looping over endpoint models..."):
        # first get the gold standard config file
        standard_config_file = get_standard_config(model)

        # now open that standard file
        with open(standard_config_file, 'r') as standard_config:
            standard_config_json = json.load(standard_config)

        audit = Parser(standard_config_json)

        # gather endpoint filenames
        endpoint_config_files = gather_endpoints(model)

        for endpoint in tqdm(endpoint_config_files,
                             desc="Looping over endpoint config files..."):
            with open(endpoint, 'r') as endpoint_file:
                endpoint_json = json.load(endpoint_file)

            config_diff = audit.compare(endpoint_json, endpoint.name)

            cleaner = Cleaner(config_diff)
            cleaned = cleaner.clean()
            audit_list.append({model: cleaned})
    return audit_list
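A hypothetical invocation, assuming the model names match the directory layout that get_standard_config and gather_endpoints expect:

# Hypothetical call; the model names are placeholders.
results = main(["model_a", "model_b"])
print(json.dumps(results, indent=2))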
def test_metadata(self):
    e = Cleaner('./tests/data/models/meta_2200_model.pkl')
    e2 = Cleaner('./tests/data/models/72000_model.pkl')
    metadata = e.metadata()
    no_metadata = e2.metadata()
    self.assertTrue(metadata)
    self.assertFalse(no_metadata)
Example #8
class TopicModeler:
    def __init__(self, **kwargs):
        '''
        kwargs are forwarded to gensim's LdaModel:

        gensim.models.ldamodel.LdaModel(corpus=None, num_topics=100, id2word=None, distributed=False, chunksize=2000, passes=1, update_every=1, alpha='symmetric', eta=None, decay=0.5, offset=1.0, eval_every=10, iterations=50, gamma_threshold=0.001, minimum_probability=0.01, random_state=None, ns_conf=None, minimum_phi_value=0.01, per_word_topics=False, callbacks=None, dtype=<class 'numpy.float32'>)
        '''
        self.cleaner = Cleaner()
        self.lda_model = None
        self.lda_kwargs = kwargs
        self.dictionary = None

    def update(self, docs):
        cleaned = [list(self.cleaner.clean(doc)) for doc in docs]
        self.dictionary = corpora.Dictionary(cleaned)
        corpus = [self.dictionary.doc2bow(text) for text in cleaned]

        if self.lda_model is None:
            self.lda_model = models.ldamodel.LdaModel(corpus,
                                                      id2word=self.dictionary,
                                                      **self.lda_kwargs)
        else:
            self.lda_model.update(corpus, id2word=self.dictionary)

    def classify(self, doc):
        bow = self.dictionary.doc2bow(list(self.cleaner.clean(doc)))
        topic = max(self.lda_model.get_document_topics(bow),
                    key=lambda x: x[1])[0]
        return self.lda_model.show_topic(topic)

    def print_topics(self):
        print(self.lda_model.print_topics(num_topics=10, num_words=3))
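A toy usage sketch, assuming gensim is available and that Cleaner.clean(doc) yields token strings:

# Hypothetical usage with a tiny two-document corpus.
modeler = TopicModeler(num_topics=2, passes=10)
modeler.update([
    "cats purr and chase mice",
    "dogs bark at the mail carrier",
])
print(modeler.classify("my cat chased a mouse"))
modeler.print_topics()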
Example #9
    def format_text(self, text, document, formatting):
        par_len = 150

        applicator = Applicator()
        cleaner = Cleaner()
        formatter = Formatter()
        new_text = self.get_string(text)

        cleaner.create_sentc_list(new_text)
        sentc_list = cleaner.get_sentc_list()
        formatter.set_sentlist(sentc_list)

        if formatting == "list":
            formatter.frmt_textlist()

        elif formatting == "block":
            formatter.frmt_textblock(par_len=par_len)

        elif formatting == "string":
            formatter.frmt_textstring()

        else:
            raise SystemExit(f"unsupported formatting option: {formatting!r}")

        formatted_text = formatter.get_text()
        applicator.apply_text(formatted_text, document=document)
    def faction(self):
        '''Display the faction selection screen.'''

        Cleaner.clean_wait_player(self, self.check_players)
        self.background_faction = Picture(self.app,
                                          image=PATH_RESSOURCES +
                                          "background_faction.png",
                                          grid=[0, 0, 20, 20])
        self.button_faction_1 = PushButton(self.app,
                                           image=PATH_RESSOURCES +
                                           "faction1.png",
                                           command=self.choose_faction,
                                           args=[1],
                                           grid=[7, 12])
        self.button_faction_2 = PushButton(self.app,
                                           image=PATH_RESSOURCES +
                                           "faction2.png",
                                           command=self.choose_faction,
                                           args=[2],
                                           grid=[8, 12])
        self.button_faction_3 = PushButton(self.app,
                                           image=PATH_RESSOURCES +
                                           "faction3.png",
                                           command=self.choose_faction,
                                           args=[3],
                                           grid=[9, 12])
        self.button_faction_4 = PushButton(self.app,
                                           image=PATH_RESSOURCES +
                                           "faction4.png",
                                           command=self.choose_faction,
                                           args=[4],
                                           grid=[10, 12])
    def show_card(self, card):
        '''Display the selected card with its name and descriptions.'''

        if self.card_img_big is not None:
            Cleaner.clean_slot_descriptions(self)
        self.card_img_big = Picture(self.app,
                                    image=PATH_RESSOURCES + "cards/card_" +
                                    card["image"] + ".png",
                                    grid=[30, 5, 7, 19])
        self.name_card = Text(self.app,
                              text=card["name"],
                              font="Impact",
                              color="white",
                              size=20,
                              align="left",
                              grid=[30, 25, 10, 1])
        self.description_card = Text(self.app,
                                     text="\"" +
                                     self.cut_line(card["description"]) + "\"",
                                     font="Impact",
                                     color="white",
                                     size=18,
                                     align="left",
                                     grid=[30, 27, 10, 3])
        self.description_power_card = Text(self.app,
                                           text=self.cut_line(
                                               card["description_power"]),
                                           font="Impact",
                                           color="white",
                                           size=18,
                                           align="left",
                                           grid=[30, 31, 10, 3])
Example #12
def __init__(self):
    self.initLogger()
    self.examiner = Examiner()
    self.sec = Secretary()
    self.clr = Cleaner()
    self.login()
    self.init()
def test_to_submission_format(self):
    e = Cleaner('./tests/data/models/ae3_213750_model.pkl')
    img = '../data/test/10.png'
    img, img_id = e.clean(img)
    csv = e.to_submission_format(img, img_id)
    row = csv[300].split(',')
    self.assertEqual(row[0], '%s_%d_%d' % (img_id, 1, 301))
    self.assertTrue(float(row[1]) <= 1.0)
Example #14
def __init__(self):
    self.initLogger()
    self.examiner = Examiner()
    self.sec = Secretary()
    self.clr = Cleaner()
    self.questionsDb = QuestionsDb('XFQuestionsLib.db')
    self.login()
    self.init()
Example #15
def do(task):
    logging.debug("Start doing task: %s", task)
    cleaner = Cleaner()
    try:
        return cleaner.clean(task)
    except Exception:
        traceback.print_exc(file=sys.stderr)
        logging.critical('Failed while cleaning for task %s', task['ID'])
        return False
Example #16
def build(self):
    '''build the database'''
    reddit = Reddit()
    cleaner = Cleaner()
    for subreddit in reddit.get_subreddits():
        for post in reddit.get_posts(subreddit):
            self.database.insert(cleaner.clean(post))
            for comment in reddit.get_comments(post):
                self.database.insert(cleaner.clean(comment))
Example #17
def __init__(self):
    self.__crawler = Crawler()
    self.__cleaner = Cleaner()
    self.__file_manager = FileManager()
    self.__search_engine = GoogleSearch(config.SEARCH_TOPIC,
                                        config.MAX_ITEM,
                                        config.NUMBER_OF_RESULTS_PER_PAGE,
                                        config.PAUSE_BTW_REQUEST)
    self.__csf_manager = CSFManager()
def __init__(self):
    self.config_parser = ConfigParser()
    self.out_dir = os.path.join(os.path.dirname(__file__), "backups/")
    self.create_empty_dir()
    self.backup_name = "%s-%s.zip" % (os.getlogin(), time.strftime("%d-%m-%Y"))
    self.directorys = self.config_parser.directories_to_backup
    self.path = os.path.join(self.out_dir, self.backup_name)
    self.zip_creator = ZipCreator(self.path, self.directorys)
    self.drive_connector = DriveConnector(self.out_dir, self.config_parser)
    self.cleaner = Cleaner(self.out_dir, self.config_parser.get_clean_time())
Example #19
    def __init__(self, **kwargs):
        '''
        kwargs are forwarded to gensim's LdaModel:

        gensim.models.ldamodel.LdaModel(corpus=None, num_topics=100, id2word=None, distributed=False, chunksize=2000, passes=1, update_every=1, alpha='symmetric', eta=None, decay=0.5, offset=1.0, eval_every=10, iterations=50, gamma_threshold=0.001, minimum_probability=0.01, random_state=None, ns_conf=None, minimum_phi_value=0.01, per_word_topics=False, callbacks=None, dtype=<class 'numpy.float32'>)
        '''
        self.cleaner = Cleaner()
        self.lda_model = None
        self.lda_kwargs = kwargs
        self.dictionary = None
    def wait_begin(self):
        '''Display the waiting screen and poll check_begin every second.'''

        Cleaner.clean_choose_faction(self)
        # change the background picture
        self.background_wait_begin = Picture(self.app,
                                             image=PATH_RESSOURCES +
                                             "background_waiting.png",
                                             grid=[0, 0, 20, 20])
        self.background_wait_begin.repeat(1000, self.check_begin)
    def test_cleaner_erase_given_data(self):
        """ Checks if it is possible to delete a file

        Notes:
             58 is hard-coded. DO NOT CHANGE
        """
        file = open('plushkin1', 'w+')
        file.close()
        file = open('plushkin2', 'w+')
        file.close()
        cc = Cleaner(['plushkin1', 'plushkin2'], 0)
        self.assertEqual(cc.clean_and_report(), ([], 1, 58, 0))
def collect_all_data(cls, df):
    judges = df.copy()
    for plaintext in judges.plain_text:
        top_split, bottom_split = Cleaner.splitter(plaintext)
        bad_names = Cleaner._unclean_names(top_split)
        good_names = Cleaner._clean_names(bad_names)
        author = Cleaner._clean_author(bottom_split)

        yield list(good_names), author
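A hypothetical usage sketch; the cls parameter suggests this was lifted from a classmethod, and judges_df is assumed to be a pandas DataFrame with a plain_text column:

# Hypothetical: iterate over the cleaned (names, author) pairs.
for names, author in collect_all_data(None, judges_df):
    print(author, names)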
Example #23
def cleanupTask(task):
    # clean up the task at hand

    # ----------------------------------------------------------------------------------------------
    # Get all parameters for the production
    # ----------------------------------------------------------------------------------------------
    cleaner = Cleaner(task)
    cleaner.logCleanup()

    print('')

    return
Example #24
    def display_db(self):
        '''
        ***not meant to stay***
        display everything in database
        '''
        cleaner = Cleaner()
        for post in self.database.posts.find():
            loaded = cleaner.load(post)
            pprint.pprint(loaded.data)

        for comment in self.database.comments.find():
            loaded = cleaner.load(comment)
            pprint.pprint(loaded.data)
Example #25
def performance_modelo(dataset, model):
    Sentenca = dataset.iloc[:, 1]
    Intencoes = dataset.iloc[:, 0]
    cleaner = Cleaner()
    Sentenca_cleaned = [cleaner.clean_text(x) for x in Sentenca]
    Sentenca_counts = CountVectorizer().fit_transform(Sentenca_cleaned)
    X_train, X_test, y_train, y_test = train_test_split(Sentenca_counts,
                                                        Intencoes,
                                                        test_size=0.15,
                                                        random_state=1)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy_score_model = accuracy_score(y_test, y_pred)
    return print(f"A acurácia do model é de {accuracy_score_model*100:.1f}%")
Example #26
    def __init__(self):
        """
        Description
        -----------
            Sets our default variable path.
        """

        self.dir_to_watch = os.getenv("DIR_TO_WATCH")
        self.destination_dir = os.getenv("DESTINATION_DIR")

        self.logger = Logger()

        self.logger.write('Automated Maid', figlet=True)
        self.cleaner = Cleaner()
Example #27
def main(dataset, model):
    Sentenca = dataset.iloc[:, 1]
    Intencoes = dataset.iloc[:, 0]
    cleaner = Cleaner()
    Sentenca_cleaned = [cleaner.clean_text(x) for x in Sentenca]
    vectorizer = CountVectorizer()
    Sentenca_counts = vectorizer.fit_transform(Sentenca_cleaned)
    model.fit(Sentenca_counts, Intencoes)
    print("Digite um comando:")
    nova_sentenca = input()
    nova_sentenca_clean = cleaner.clean_text(nova_sentenca)
    #nova_sentenca = "liga a luz"
    counts_da_nova_sentenca = vectorizer.transform(
        [cleaner.clean_text(nova_sentenca_clean)])
    interpretacao_sentenca(counts_da_nova_sentenca, nova_sentenca)
Example #28
def create_db(self) -> None:
    """Create database, collect data and insert them."""
    print("creating tables...")
    Base.metadata.create_all(self.engine)
    print("tables created")
    print("uploading data from api...")
    collector = Collector()
    data = collector.collect()
    cleaner = Cleaner()
    data_cleaned = cleaner.cleaner(data)
    print("upload successful")
    print("adding data to tables...")
    installer = Installer()
    installer.install(data_cleaned, self.engine)
    print("database installed successfully")
Example #29
def infer():
    if request.method == "POST":

        body = json.loads(request.data.decode("utf-8"))
        text = body.get("text", "")

        if text == "":
            return jsonify(
                result=""
            )

        cleaner = Cleaner()
        text = clean_text(cleaner, text)

        tokenizer = joblib.load(os.path.join(MODELS, "tokenizer.pkl"))
        sequence = tokenizer.texts_to_sequences([text])
        test = pad_sequences(
            sequence, maxlen=max_len
        )

        _, model = create_BiLSTMRNN()
        model.load_weights(
            os.path.join(MODELS, 'BiLSTM.hdf5'))
        return jsonify(
            result=sentiment[
                np.around(model.predict(test), decimals=0).argmax(axis=1)[0]]
        )
    else:
        return jsonify(
            result="POST API call is required"
        )
Example #30
def process_html_page(coredb, driver, page, config, locks):
    source = get_clean_source(driver.page_source)
    text_content_dirty = driver.find_element_by_tag_name("body").text
    text_content = Cleaner.clean_all(text_content_dirty)
    text_content_hash = get_source_hash(text_content)
    duplicate_page = coredb.get_page_with_hash(text_content_hash)
    if duplicate_page is not None:
        logging.debug('Duplicate page found with url: ' + duplicate_page['url'])
        # TODO: add a column to the db recording which page this is a duplicate of
        coredb.update_page(page['id'], PageType.DUPLICATE.value, 200, None, None, duplicate_page['id'])
        return

    if False:  # don't add new links (intentionally disabled)
        links = get_links_from_page(driver, source)
        for url in links:
            handle_new_link(coredb, config, url, page['id'], locks)

    #imgs = driver.find_elements_by_xpath('//img[@src]')
    #img_srcs = set([img.get_attribute('src') for img in imgs])
    #for img_src in img_srcs:
        #if ',' in img_src:
            #continue  # for example svg+xml, ....
        #handle_new_image(coredb, page['id'], img_src)

    coredb.update_page(page['id'], PageType.HTML.value, 200, text_content, text_content_hash)
Example #31
class DomesticViolenceClassifier:
    def __init__(self):
        self.cleaner = Cleaner()
        self.maxlen = 900
        self.tokenizer = None
        self.parent_path = Path(__file__).parent.parent

        with open(
                self.parent_path /
                'data/neural_network_config/tokenizer.pickle', 'rb') as handle:
            self.tokenizer = pickle.load(handle)

        # load the model architecture from JSON
        with open(self.parent_path /
                  'data/neural_network_config/model.json', 'r') as json_model_keras:
            loaded_model_json = json_model_keras.read()
        self.loaded_model = model_from_json(loaded_model_json)

        # load weights into new model
        self.loaded_model.load_weights(self.parent_path /
                                       "data/neural_network_config/model.h5")

        # compile the loaded model so it can be used for prediction
        self.loaded_model.compile(optimizer='adam',
                                  loss='mean_squared_error',
                                  metrics=['mae', 'accuracy'])

    # returns the probability that the text concerns domestic violence
    def domestic_violence_subject_probability(self, text: str):
        x = self.tokenizer.texts_to_sequences([self.cleaner.clean_text(text)])
        x = pad_sequences(x, padding='post', maxlen=self.maxlen)

        y_new = self.loaded_model.predict(x)
        return y_new[0][0]
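A hypothetical usage sketch, assuming the pickled tokenizer and the model JSON/weights exist under data/neural_network_config:

# Hypothetical call; the input text is a placeholder.
clf = DomesticViolenceClassifier()
p = clf.domestic_violence_subject_probability("example input text")
print(f"domestic-violence subject probability: {p:.3f}")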
Example #32
def main():

    # instantiate all the useful classes
    linker = EntityLinker()
    cleaner = Cleaner()
    sentiment = Sentiment()
    utility = Utility()
def process(self):
    maxlen, chars, x, y, text, chars, char_indices, indices_char, next_chars = \
        Cleaner().mainClean()
    model = self.createModel(maxlen, chars)
    print_callback = LambdaCallback(
        on_epoch_end=self.on_epoch_end(model, self.on_epoch_end(),
                                       text, maxlen, chars, char_indices,
                                       indices_char))
    ModelTrainer().main(print_callback, model, x, y)
def __init__(self, enable):
    self.enable = enable
    self._tips = {}
    self._new_tips = set()
    self.lock = Lock()
    if self.enable:
        self.fetcher = Fetcher(self._tips, self.lock, self._new_tips)
        self.cleaner = Cleaner(self._tips, self.lock, self._new_tips)
        self.fetcher.start()
        self.cleaner.start()
Example #35
def clean_invalid_glyphs_and_remove_hinting(fontfile, hinting, output):
  whitespace_and_ignorable_list = get_whitespace_and_ignorable_list()
  cleaner = Cleaner(fontfile, hinting, whitespace_and_ignorable_list)
  cleaner.clean()
  # Flatten cmap format 4 (no idRangeOffset/glyphIdArray) so it is a simple
  # subset of format 12.
  change_method(_c_m_a_p.cmap_format_4, _cmap_format_4_compile, 'compile')
  cleaner.save(output)
  cleaner.close()
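A hypothetical invocation; the font paths are placeholders for whatever fonttools-based pipeline this function belongs to:

# Hypothetical call with placeholder paths.
clean_invalid_glyphs_and_remove_hinting('fonts/NotoSans-Regular.ttf',
                                        hinting=False,
                                        output='fonts/NotoSans-Clean.ttf')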
Example #36
def sqliteCustomer():
    sqlite = Sqlite()
    sqlite.createTable()     # create the table
    cl = Cleaner(sqlite)

    while True:
        if not gl.QTFLAG:    # check the quit flag
            gl.DCFLAG = False
            break
        try:
            data = gl.MYQ.get()
            if data[0] == 3:
                addImg(sqlite, data)
            elif data[0] == 7:
                print('clear')
                cl.cleanOTImg()
        except Queue.Empty:
            time.sleep(1)
        except Exception as e:
            logger.error(str(e))
            gl.TRIGGER.emit("<font %s>%s</font>" % (gl.style_red, getTime() + str(e)))
            raise
Example #37
def clean_invalid_glyphs_and_remove_hinting(fontfile, hinting, output, verbose):
    whitespace_and_ignorable_list = get_whitespace_and_ignorable_list()
    cleaner = Cleaner(fontfile, hinting, whitespace_and_ignorable_list)
    cleaner.clean(verbose)
    # Flatten cmap format 4 (no idRangeOffset/glyphIdArray) so it is a simple
    # subset of format 12.
    # do we still want this?
    change_method(_c_m_a_p.cmap_format_4, _cmap_format_4_compile, "compile")
    old_12_or_13_compile = change_method(_c_m_a_p.cmap_format_12_or_13, _cmap_format_12_or_13_compile, "compile")
    cleaner.save(output)
    cleaner.close()
    change_method(_c_m_a_p.cmap_format_12_or_13, old_12_or_13_compile, "compile")
    def test_clean(self):
        img = config.data_dir_path + 'test/10.png'
        PIL.Image.open(img).show()

        e = Cleaner('./tests/data/models/13_72000_model.pkl')
        e.clean_and_show(img)

        e = Cleaner('./tests/data/models/ae3_213750_model.pkl')
        e.clean_and_show(img)
Example #39
def __init__(self, config={}):
    Cleaner.__init__(self, config)
    self.repo = git.Repo(self.cwd)
    self.remote = getattr(self.repo.remotes, self.remote)  # resolve the configured remote name
Example #40
def main(p):
    start = time.time()

    # select the files whose names end with 'json.gz'
    file_name_list = filter(lambda x: x.endswith('json.gz'), os.listdir(p))

    # TODO: check that there are exactly 24 files (glob module)

    for file_name in file_name_list:
        with open(os.path.join(p, file_name), 'rb') as f:
            raw_json_file = gzip.GzipFile(fileobj=f)

            record_cleaner = Cleaner()
            record_grouper = Grouper(db)
            record_normalizer = Normalizer(db)
            mongo_helper = MongoHelper(db)
            counter = ActorCounter()
            evaluater = Evaluater()

            # clean the data
            record_cleaner.set_dirty_data(raw_json_file)
            record_cleaner.clean()
            clean_record = record_cleaner.get_clean_data()
            log.log('clean record %s' % len(clean_record))

            # process the data

            # group the records
            record_grouper.set_records(clean_record)
            record_grouper.group()
            record_actor_exist = record_grouper.get_group_1()
            record_actor_new = record_grouper.get_group_2()
            log.log('record_actor_exist: %s' % len(record_actor_exist))
            log.log('record_actor_new: %s' % len(record_actor_new))


            # handle records whose actor already exists
            log.log('Begin processing actor-exist records...')
            # only the record's actor_attributes need to be dropped
            for record in record_actor_exist:
                del record['actor_attributes']
            log.log('Finished.')


            # handle records whose actor does not exist yet
            record_normalizer.set_records(record_actor_new)
            record_normalizer.normalize()
            record_actor_new = record_normalizer.get_record_actor_new()
            new_actors = record_normalizer.get_new_actors()

            # push today's newly added actors to the database
            actors = new_actors.values()
            mongo_helper.insert_new_actors(actors)

            # for the new actors, update the corresponding counters in Redis
            counter.count_actor_list(actors)

            # compute each record's val
            evaluater.set_records(record_actor_exist)
            evaluater.evaluate()
            val_actor_exist = evaluater.get_val_cache()

            evaluater.set_records(record_actor_new)
            evaluater.evaluate()
            val_actor_new = evaluater.get_val_cache()

            # insert the records into the database
            mongo_helper.insert_new_reocrds(record_actor_new)
            mongo_helper.insert_new_reocrds(record_actor_exist)

            # update today's per-user val increments in the database
            mongo_helper.update_val(val_actor_new)
            mongo_helper.update_val(val_actor_exist)

            record_cleaner.free_mem()
            del record_cleaner
            del record_grouper
            del record_normalizer
            del mongo_helper
            del counter
            del evaluater

    # generate the CSV file
    util.grcount2csv()

    end = time.time()
    log.log('total: %s s' % (end - start))
from cleaner import Cleaner

cleaner = Cleaner()
cleaner.run()
Example #42
from feeder import FeedDownloader
from webhelper import WebHelper
from cleaner import Cleaner

from threading import Thread

import time

t1 = time.time()

fd = FeedDownloader("http://mybroadband.co.za/news/feed", "My Broadband")
wh = WebHelper()
cleaner = Cleaner()

articles = fd.parse()

print time.time() - t1, "setup complete"
t1 = time.time()

wh.get_html_threaded(articles)
print time.time() - t1, "threaded download complete"

t1 = time.time()
for a in articles:
    a.html = wh.attempt_get_html(a.url)
print time.time() - t1, "non threaded download complete"

t1 = time.time()
for a in articles:
    if a.html:
        a.plaintext = cleaner.clean(a.html)
Example #43
def clean(self, base, file_base):
    cleaner = Cleaner(logger=self.logger, options=self.options)
    cleaner.handle_aux(base, file_base)
Example #44
def main():
    parser = OptionParser(prog="reaper",
                          version="0.1.0",
                          usage="%prog [options] <path to folder> " +
                          "[<path to folder...>]",
                          description="PyReaper is a small tool that detects " +
                          "duplicated files by hashing them and then deletes " +
                          "these duplicated files, leaving just one of them",
                          epilog="CAUTION: handle with EXTREME CARE, " +
                          "use the -n option first if you are not sure of " +
                          "what you are doing, this thing deletes stuff!!!")
    parser.add_option("-n",
                      "--no-action",
                      dest="noaction",
                      action="store_true",
                      help="does not executes any file action")
    parser.add_option("-d",
                      "--delete",
                      dest="delete",
                      action="store_true",
                      help="delete every duplicated file")
    parser.add_option("-m", 
                      "--move-to",
                      dest="moveto",
                      metavar="DIR",
                      help='Moves duplicated files instead of deleting them')
    parser.add_option("-p",
                      "--print-rm-commands",
                      dest="rmcommands",
                      action="store_true",
                      help="skips delete process and prints a set of \"rm\" " + 
                      "commands so you can delete the duplicate files yourself")
    parser.add_option("-i",
                      "--interactive",
                      dest="interactive",
                      action="store_true",
                      help="interactive mode, will ask for each duplicate. " + 
                      "By default it deletes every duplicate found but " + 
                      "the first one")
    parser.add_option("-y",
                      "--dont-ask-confirmation",
                      dest="noconfirmation",
                      action="store_true",
                      help="skips confirmation question. ")
    parser.add_option("-s",
                      "--store-hashes",
                      dest="storehash",
                      action="store_true",
                      help="store and keep calculated hashes in .digest hidden files ")
    parser.add_option("-t",
                      "--delete-empty-trees",
                      dest="deletedirs",
                      action="store_true",
                      help="deletes empty trees when finishes")
    parser.add_option("-e", 
                      "--ext", 
                      dest="extension", 
                      action="store",
                      help="only digests files with the given extension" )
    parser.add_option("-v",
                      "--verbose",
                      dest="verbose",
                      action="store_true",
                      help="outputs much more information during process " + 
                      "(sometimes even too much)")
    parser.add_option("",
                      "--ignore-stored-hashes",
                      dest="ignorehashes",
                      action="store_true",
                      help="ignores stored calculated hashes in .digest " + 
                      "hidden files, this means every hash will be " + 
                      "recalculated")
    
    (options, args) = parser.parse_args()

    if not args:
        exit_with_error('', parser)
    
    br = Walker(options.extension,
                options.storehash,
                options.verbose,
                options.ignorehashes)

    action = None
    moveto = None
    rmcommands = False
    
    if options.noaction:
        action = 'n'
        
    elif options.moveto:
        action = 'm'
        moveto = options.moveto
        
        if not moveto:
            exit_with_error('No "move to" target provided', parser)
            
        elif not os.path.exists(moveto):
            exit_with_error('Path %s does not exist' % moveto, parser)
            
        elif not os.path.isdir(moveto):
            exit_with_error('Path %s is not a directory' % moveto, parser)
        
    elif options.delete:
        action = 'd'
        rmcommands = options.rmcommands
        
        
    if action is None:
        exit_with_error('No action selected', parser)

    for path in args:
        if not os.path.exists(path):
            exit_with_error("path {0} does not exists".format(path), parser)
        br.digest(path)
    
    duplicates = br.collisions()
    clean = False
    
    if duplicates:

        print "Duplicates found, cleaning..."
        c = Cleaner(
                    duplicates,
                    options.interactive,
                    options.verbose,
                    action,
                    rmcommands,
                    options.noconfirmation,
                    moveto)

        clean = c.clean()
        
    else:
        print "No duplicates found"
        
    if not options.storehash:
        print("Deleting digest files...")
        c = Cleaner(verbose=options.verbose)
        c.delete(br.digestFiles(), -1, True)
        
        
    if options.deletedirs:
        c = Cleaner(verbose=options.verbose)
        for path in args:
            empty_dirs = br.findEmptyDirs(path)
            for dir in empty_dirs:
                if options.rmcommands or options.noaction:
                    print("Keeping empty tree {0}".format(dir))
                else:
                    c.deleteDir(dir)
        
    if clean:
        sys.exit(0)
    else:
        sys.exit(1)
Example #45
from os import environ as environment
import argparse, yaml
import logging
from cleaner import Cleaner

parser = argparse.ArgumentParser()
parser.add_argument("--path", help="path to run cleaner on", type=str)
args = parser.parse_args()

# logging.basicConfig(level=logging.DEBUG)


with open("config.yml") as sets:
    config = yaml.load(sets)

path = args.path
if not path:
	path = config["cleaner"]["general_pattern"]

cleaner = Cleaner(config["cleaner"])

print "Cleaning path: " + str(path)
cleaner.clean(path, True)
Example #46
def clean_invalid_glyphs_and_remove_hinting(fontfile, hinting, output):
  whitespace_list = get_whitespace_list()
  cleaner = Cleaner(fontfile, hinting, whitespace_list)
  cleaner.clean()
  cleaner.save(output)
  cleaner.close()