Example #1
def generate_conf():
    """
    Generate the config file if it does not exist
    :return:
    """
    conf = os.path.join(os.getcwd(), "app.conf")
    if not os.path.exists(conf):
        content = "# Global configuration\n"
        content += "[APP]\n"
        content += "# Data storage directory\n"
        content += "DATA_DIR = \n"
        content += "# Which database to use: MySQL or Sqlite3\n"
        content += "DATABASE = \n"
        content += "# MySQL configuration\n"
        content += "[MYSQL]\n"
        content += "# Host address\n"
        content += "HOST = \n"
        content += "# Port\n"
        content += "PORT = \n"
        content += "# Username\n"
        content += "USER = \n"
        content += "# Password\n"
        content += "PASSWORD = \n"
        content += "# Database name\n"
        content += "BD_NAME = \n"
        content += "# Character encoding\n"
        content += "CHARSET = \n"
        content += "# Sqlite3 configuration\n"
        content += "[SQLITE3]\n"
        content += "# Database name\n"
        content += "BD_NAME = \n"
        content += "# Character encoding\n"
        content += "CHARSET = \n"
        FileUtil.writ_file(conf, content)
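For reference, a minimal sketch of reading the generated file back with the standard-library configparser, assuming FileUtil.writ_file writes the text verbatim:

import configparser

config = configparser.ConfigParser()
config.read("app.conf")
print(config.get("APP", "DATA_DIR"))   # empty string until filled in
print(config.get("APP", "DATABASE"))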
Example #2
def download_chromedriver():
    """
    Download the Chrome driver
    http://chromedriver.storage.googleapis.com/index.html
    :return:
    """
    # Fetch the list of version prefixes
    url = "http://chromedriver.storage.googleapis.com/"
    result = BeautifulSoup(HttpUtil.get(url, {
        "delimiter": "/",
        "prefix": ""
    }).text,
                           features="lxml")
    prefix = result.find_all("prefix")
    # Filter
    # info = [s.extract() for s in prefix('prefix')]

    local_version = get_local_version(prefix)

    # Fetch the file list under this version
    driver_list = BeautifulSoup(HttpUtil.get(url, {
        "delimiter": "/",
        "prefix": local_version
    }).text,
                                features="lxml")
    filename_list = driver_list.find_all("key")

    for s in filename_list:
        s = s.text
        # If the file name contains the platform name
        if s.find(sys.platform) != -1:
            filename = s[len(local_version):]
            # Download the file
            HttpUtil.download_file(url + s, None, filename)
            FileUtil.zip_extract(filename, None)
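get_local_version is referenced but not shown in this example; a hypothetical sketch of what it might do, matching the installed Chrome's major version against the listed prefixes (both helpers below are assumptions, not the original code):

def get_local_version(prefix_tags):
    major = get_chrome_major_version()  # assumed helper, e.g. returns "96."
    for tag in prefix_tags:
        version = tag.text  # e.g. "96.0.4664.45/"
        if version.startswith(major):
            return version  # keep the trailing "/" so it works as a URL prefix
    return prefix_tags[-1].text  # fall back to the last listed version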
Example #3
def download_taobao_chromedriver():
    """
    Download chromedriver from the Taobao mirror
    http://npm.taobao.org/mirrors/chromedriver
    :return:
    """
    # Fetch the list of version prefixes
    url = "http://npm.taobao.org/mirrors/chromedriver/"
    result = BeautifulSoup(HttpUtil.get(url).text, features="lxml")
    prefix = result.find("pre").find_all("a")
    # Filter
    # info = [s.extract() for s in prefix('prefix')]

    local_version_url = url + get_local_version(prefix)

    # Fetch the file list under this version
    driver_list = BeautifulSoup(HttpUtil.get(local_version_url).text,
                                features="lxml")
    filename_list = driver_list.find_all("a")

    for s in filename_list:
        s = s.text
        # If the file name contains the platform name
        if s.find(sys.platform) != -1:
            # Download the file
            HttpUtil.download_file(local_version_url + s, None, s)
            FileUtil.zip_extract(s, None)
Example #4
def getSVC(target, labels, train_new):
    # If not retraining, load the previously saved classifier
    if not train_new:
        classifier = joblib.load(Constants.SKL_DATA_FILE_NAME)
        return classifier
    # First half is trainData, remaining is testData
    train_cells = target
    ######     Now training      ########################
    deskewed = [list(map(deskew, row)) for row in train_cells]
    hogdata = [list(map(hog, row)) for row in deskewed]
    trainData = np.float32(hogdata).reshape(-1, 64)
    print(trainData.shape)

    # responses is an n x 1 array, i.e. [[1],[2],[3]...]
    responses = np.int8(labels)
    # Save target and labels to file
    FileUtil.writeTarget(trainData)
    FileUtil.writeLabels(responses)
    # Create a classifier: a support vector classifier
    # TODO: tune these params with GridSearchCV
    # classifier = svm.SVC(gamma=0.001, probability=True)
    classifier = svm.SVC(kernel='rbf', C=2, gamma=0.001, probability=True)
    # We learn the digits on the first half of the digits
    print(trainData.shape)
    print(responses.shape)
    classifier.fit(trainData, responses)
    # Dump the trained classifier
    joblib.dump(classifier, Constants.SKL_DATA_FILE_NAME)

    return classifier
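A hedged usage sketch, assuming deskew and hog are the OpenCV-digits preprocessing helpers used above and that labels matches the flattened training grid:

classifier = getSVC(train_cells, labels, train_new=True)
sample = np.float32([hog(deskew(cell))]).reshape(-1, 64)  # cell: one digit image
print(classifier.predict(sample))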
Example #5
def download_and_save_abstracts_for_search_term(search_term, dataset, max_ids):
    abstracts_df = retrieve_pubmed_abstracts([search_term], max_ids)
    dataset_output_directory = os.path.join(global_output_directory_name,
                                            dataset)
    FileUtil.create_directory_if_not_exists(dataset_output_directory)
    abstracts_df.to_csv(os.path.join(dataset_output_directory,
                                     'abstracts.csv'))
    return abstracts_df
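A usage sketch with an illustrative search term and ID limit:

abstracts = download_and_save_abstracts_for_search_term(
    "vitamin d", dataset="pubmed_sample", max_ids=100)
print(abstracts.shape)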
Example #6
def update_hosts(new_hosts):
    """
    Overwrite the hosts file with updated entries
    :param new_hosts: list of new DNS lines
    :return:
    """
    FileUtil.remove_read_only(Constants.HOSTS_PATH)
    FileUtil.write_lines(Constants.HOSTS_PATH, new_hosts)
    # Flush the DNS cache (Windows)
    os.system("ipconfig /flushdns")
Example #7
def run_command(directory):
    dir_size = FileUtil.count_dir_size(directory)
    if dir_size >= 107374182400:  # 100 GiB
        print(FileUtil.size_unit_format(dir_size))
        print(
            os.system(
                "rclone move /home/reptile-python/images/ gdrive:/images --min-size 100k"
            ))
        print(FileUtil.size_unit_format(FileUtil.count_dir_size(directory)))
    print(os.popen("rclone dedupe gdrive:/images --dedupe-mode newest").read())
    print(os.popen("rclone delete gdrive:/images --max-size 100k").read())
    threading.Timer(21600, run_command, (directory, )).start()
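threading.Timer reschedules the job every 21600 seconds (6 hours); a sketch of kicking it off with the directory used above:

run_command("/home/reptile-python/images/")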
Example #8
    def filePath(self):
        """
        Check that FileUtil.dirname returns the parent directory of a file path
        :return:
        """
        fileName = os.path.join(os.getcwd(), "content.txt")
        self.assertEqual(os.getcwd(), FileUtil.dirname(fileName))
Example #9
def get_bing():
    """
    Fetch the Bing picture-of-the-day image
    :return:
    """
    data = {'format': 'js', 'idx': 0, 'n': 1}
    try:
        response = HttpUtil.get_json(
            url='http://cn.bing.com/HPImageArchive.aspx', data=data)
        logging.debug(response)
    except Exception as e:
        logging.error("Network request error: %s", e)
        time.sleep(120)
        return get_bing()
    images = response["images"]
    url = "http://cn.bing.com" + images[0]["url"].split("&")[0]

    # Build the directory path
    directory = os.path.join(Constants.APP_DIRECTORY, "images")

    image_name = url.split("=")[1]
    # Build the absolute file path
    image_path = os.path.join(directory, image_name)
    # Download the image
    HttpUtil.download_file(url, directory, image_name)
    # Split off the extension; if it is not .bmp
    if os.path.splitext(image_name)[1] != ".bmp":
        # Convert to bmp
        image_path = FileUtil.image_to_bmp(image_path)
    return image_path
Example #10
    def find_and_save_food_disease_dfs(self, ids_and_abstracts, dataset):

        save_directory = os.path.join(global_output_directory_name, dataset)
        FileUtil.create_directory_if_not_exists(save_directory)

        for extractor in self.food_extractors + self.disease_extractors:
            print(extractor.name)
            df_to_save = pd.DataFrame()
            i = 0
            save_file = os.path.join(
                save_directory,
                '{extractor_name}.csv'.format(extractor_name=extractor.name))
            if not os.path.isfile(save_file):
                for (file_name, file_content) in ids_and_abstracts:
                    doc = self.english_model(file_content)
                    #print(i)
                    i += 1
                    file_name = str(file_name)
                    try:
                        extracted_df = extractor.extract(doc,
                                                         file_name,
                                                         self.dataset,
                                                         save_entities=False)
                        extracted_df['extractor'] = extractor.name
                        extracted_df['file_name'] = file_name
                        df_to_save = pd.concat([df_to_save, extracted_df])
                    except Exception:
                        if self.verbose:
                            print('Error happened')
                            traceback.print_exc(file=sys.stdout)

                    if i % 1000 == 0:
                        df_to_save.drop_duplicates().to_csv(
                            os.path.join(
                                save_directory,
                                '{extractor_name}_{i}.csv'.format(
                                    extractor_name=extractor.name, i=i)))
                if df_to_save.shape[0] == 0:
                    df_to_save = pd.DataFrame(columns=[
                        'start_char', 'end_char', 'entity_type', 'entity_id',
                        'text', 'sentence', 'sentence_index', 'extractor',
                        'file_name'
                    ])
                df_to_save.drop_duplicates().to_csv(save_file)
            else:
                print('File already exists: {0}'.format(save_file))
Example #11
def delete_dns(dns):
    """
    Remove the given DNS entries from the hosts file
    :param dns: list of entries to remove
    :return: hosts lines after removal
    """
    hosts = FileUtil.read_file(Constants.HOSTS_PATH)
    new_hosts = []
    for host in hosts:
        if not ObjectUtil.is_in_array(host.strip("\n"), dns):
            new_hosts.append(host)
    return new_hosts
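delete_dns pairs naturally with update_hosts from Example #6; a sketch with illustrative entries:

stale = ["0.0.0.0 example.com", "0.0.0.0 cdn.example.com"]  # illustrative
update_hosts(delete_dns(stale))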
Example #12
def init():
    """
    Initialization
    :return:
    """
    app = FileUtil.Config("app.conf")
    data_dir = app.get("APP", "DATA_DIR")
    if not data_dir:
        raise ValueError("Please configure the data storage directory!")

    # If the directory does not exist
    if not os.path.exists(data_dir):
        # Parent directory
        parent_path = os.path.dirname(os.path.dirname(__file__))
        # Join the path
        data_dir = os.path.join(parent_path, data_dir)
        # Create the directory
        os.mkdir(data_dir)

    database = app.get("APP", "DATABASE")
    if not database or database.lower() not in ("mysql", "sqlite3"):
        raise ValueError("Please configure which database to use!")
    if database.lower() == "mysql":
        database = database.upper()
        host = app.get(database, "HOST")
        if not host:
            raise ValueError("Please configure the MySQL host!")
        port = app.get(database, "PORT")
        if not port:
            raise ValueError("Please configure the MySQL port!")
        user = app.get(database, "USER")
        if not user:
            raise ValueError("Please configure the MySQL username!")
        password = app.get(database, "PASSWORD")
        if not password:
            raise ValueError("Please configure the MySQL password!")
        db_name = app.get(database, "BD_NAME")
        if not db_name:
            raise ValueError("Please configure the MySQL database name!")
        charset = app.get(database, "CHARSET")
        if not charset:
            raise ValueError("Please configure the MySQL character encoding!")

    if database.lower() == "sqlite3":
        database = database.upper()
        db_name = app.get(database, "BD_NAME")
        if not db_name:
            raise ValueError("Please configure the Sqlite3 database name!")
        charset = app.get(database, "CHARSET")
        if not charset:
            raise ValueError("Please configure the Sqlite3 character encoding!")
Example #13
    def __init__(self, configFile='redis.properties'):
        """
        Initialization; defaults to redis.properties
        :param configFile: config file name
        """
        config = FileUtil.Properties(configFile)
        password = None
        if not CommonUtil.isEmpty(config.get('pass')):
            password = config.get('pass')

        super(RedisClient, self).__init__(host=config.get('host'),
                                          port=config.get('port', 6379),
                                          password=password)
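A usage sketch, assuming RedisClient subclasses redis-py's client (as the super() call suggests) and that redis.properties sits in the working directory:

client = RedisClient()            # reads redis.properties
client.set("greeting", "hello")
print(client.get("greeting"))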
Example #14
def generate_conf():
    """
    Generate the config file if it does not exist
    :return:
    """
    conf = os.path.join(os.getcwd(), "app.conf")
    if not os.path.exists(conf):
        # Flush-left so configparser does not see leading whitespace
        content = """\
# Global configuration
[APP]
# Data storage directory
DATA_DIR = 
# Which database to use: MySQL or Sqlite3
DATABASE = 
# MySQL configuration
[MYSQL]
# Host address
HOST = 
# Port
PORT = 
# Username
USER = 
# Password
PASSWORD = 
# Database name
BD_NAME = 
# Character encoding
CHARSET = 
# Sqlite3 configuration
[SQLITE3]
# Database name
BD_NAME = 
# Character encoding
CHARSET = 
"""
        FileUtil.writ_file(conf, content)
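If you would rather keep the literal indented with the code, textwrap.dedent from the standard library strips the common leading whitespace before writing (a sketch; the flush-left string above achieves the same thing):

import textwrap

content = textwrap.dedent("""\
    # Global configuration
    [APP]
    DATA_DIR =
    DATABASE =
    """)
FileUtil.writ_file(conf, content)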
Example #15
    def get_statistic_row(self, _rows, _pc, _description, _batch_no,
                          _curr_path, _root_path, _level, logger):
        """
        递归获取文件夹子文件夹的信息
        @author [email protected]
        :param _rows:
        :param _pc: pc mark
        :param _description: root dir mark
        :param _batch_no:
        :param _curr_path:
        :param _root_path:
        :param _level:
        :param logger:
        :return:
        """
        _logger = logger.getChild('get_statistic_row')
        file_num = 0
        dir_num = 0
        file_size = 0
        zero_num = 0
        _file_num = 0  # file count
        _dir_num = 0  # directory count
        _file_size = 0  # total file size
        _zero_num = 0  # zero-byte file count
        _row = []
        _logger.debug("start - %s" % _curr_path)
        for loop_dir_path, loop_dir_names, loop_file_names in os.walk(
                _curr_path, followlinks=False):
            # _file_num = loop_file_names.__len__()  # would also count shortcuts
            # _dir_num = loop_dir_names.__len__()  # would also count shortcuts
            if len(loop_file_names) > 0:
                for child_file_name in loop_file_names:
                    temp_path = os.path.join(loop_dir_path, child_file_name)
                    ext = os.path.splitext(temp_path)[1][1:].lower()
                    if ext != 'lnk' and os.path.isfile(temp_path):
                        _file_num += 1
                        _temp_size = os.path.getsize(temp_path)
                        _file_size += _temp_size
                        if _temp_size == 0:
                            _zero_num += 1
            if len(loop_dir_names) > 0:
                for child_dir_name in loop_dir_names:
                    temp_path = os.path.join(loop_dir_path, child_dir_name)
                    real_path = os.path.realpath(temp_path)
                    if temp_path == real_path:
                        arr = self.get_statistic_row(_rows, _pc, _description,
                                                     _batch_no, temp_path,
                                                     _root_path, _level + 1,
                                                     logger)
                        file_num += arr[0]
                        dir_num += arr[1]
                        file_size += arr[2]
                        zero_num += arr[3]
                        _dir_num += 1
            file_num += _file_num
            dir_num += _dir_num
            file_size += _file_size
            zero_num += _zero_num
            break
        if _level <= self.max_level:
            _dir_name = os.path.basename(_curr_path)
            _dir_path = os.path.abspath(_curr_path)
            _parent_path = os.path.abspath(
                os.path.dirname(_curr_path) + os.path.sep + ".")

            _row.append(_pc)
            _row.append(_description)
            _row.append(_batch_no)
            _row.append(_dir_name)
            _row.append(_dir_path)
            _row.append(_parent_path)
            _row.append(_root_path)
            _row.append(_level)
            _row.append(_file_num)
            _row.append(file_num)
            _row.append(_dir_num)
            _row.append(dir_num)
            _row.append(FileUtil.format2MB(_file_size))
            _row.append(FileUtil.format2MB(file_size))
            _row.append(_zero_num)
            _row.append(zero_num)
            _row.append((datetime.now()).strftime('%Y-%m-%d %H:%M:%S'))
            _rows.append(_row)
            self.total_num += 1
            if len(_rows) == 500:
                temp_rows = _rows.copy()
                CsvUtil.write(self.work_csv, [], temp_rows)
                self.logger.info("csv export 500.")
                _rows.clear()
        arr = [file_num, dir_num, file_size, zero_num]
        _logger.debug("end - %s ,%s" % (_curr_path, _row))
        self.trace_num += 1
        if self.trace_num % 10000 == 0:
            _logger.info("current trace num: %d %s" %
                         (self.trace_num, _curr_path))
        return arr
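An invocation sketch with illustrative arguments; the buffering fields (work_csv, total_num, trace_num, max_level) are assumed to be initialized by the surrounding class:

rows = []
totals = self.get_statistic_row(rows, "PC-01", "archive-root", "batch-001",
                                r"D:\data", r"D:\data", 0, logging.getLogger())
# totals == [file_num, dir_num, file_size, zero_num] for the whole tree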
Example #16
    def clear(event):
        nonlocal draw_data
        event.widget.delete("all")
        draw_data = [[0] * 28 for _ in range(28)]
        
    def done(_):
        master.destroy()
    
    master = tk.Tk()
    canvas = tk.Canvas(master, width=560, height=560)
    canvas.pack()
    canvas.bind('<ButtonPress-1>', draw)
    canvas.bind('<B1-Motion>', draw)
    canvas.bind('<Double-1>', clear)
    canvas.bind('<ButtonPress-2>', done)
    
    master.mainloop()
    return np.array([a for b in draw_data for a in b])

    
if __name__ == "__main__":
    dataset_index = -3
    data = fUtil.load_data(fUtil.TRAINING_DATA_NAMES[dataset_index])
    name = fUtil.TRAINING_DATA_NAMES[dataset_index][:1].upper() + fUtil.TRAINING_DATA_NAMES[dataset_index][1:-4]
    show_image(data[0])
    show_image(data, bulk_size=49, name=name)
    data = show_drawable_canvas()
    print(data)
    show_image(data)
Example #17
# Option: save training array to cache
SAVE_TO_CACHE = False
# Option: save trained neural network data
SAVE_NEURAL_NETWORK = False
# Option: skip the training of the neural network (debugging)
SKIP_TRAINING = True

# Code starts
if not LOAD_FROM_CACHE:
    # Load training / testing data
    print("Loading training and testing data...")
    training, testing = [], []  # Array of tuples: (image, answer)
    category_count = len(fUtil.TRAINING_DATA_NAMES)
    for category in fUtil.TRAINING_DATA_NAMES:
        # Print the progress
        print(">> Loading " + fUtil.get_name(fUtil.get_index(category)) +
              "... (" + str(fUtil.get_index(category) + 1) + "/" +
              str(category_count) + ")")
        # Normalize the data for more efficiency
        data = fUtil.load_data(category, normalize=True)
        # Split the data into training data and testing data
        train_limit = int(len(data) * TRAIN_TEST_RATIO)
        index = fUtil.get_index(category)
        # Append the current data to master data list
        training += [(image_data,
                      [1 if a == index else 0 for a in range(category_count)])
                     for image_data in data[:train_limit]]
        testing += [(image_data,
                     [1 if a == index else 0 for a in range(category_count)])
                    for image_data in data[train_limit:]]
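The comprehension above builds one-hot target vectors; for example, with category_count = 4 and index = 2:

>>> [1 if a == 2 else 0 for a in range(4)]
[0, 0, 1, 0]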
Example #18
    def testWrite(self):
        """
        
        :return: 
        """
        fileName = os.path.join(os.getcwd(), "content.txt")
        exist = FileUtil.existFile(fileName)
        if exist:
            FileUtil.delFile(fileName)

        FileUtil.writeContent(fileName, "HelloWorld")
        FileUtil.writeContent(fileName, "HelloWorld")

        content = FileUtil.readContent(fileName)
        self.assertEqual(content, "HelloWorldHelloWorld")

        FileUtil.delFile(fileName)
        self.assertFalse(FileUtil.existFile(fileName))
Example #19
    def save(self, doc: Doc, objects: List, file_name: str, file_subdirectory: str):
        output_directory = self.get_output_directory(file_subdirectory)
        FileUtil.create_directory_if_not_exists(output_directory)
        doc, objects_column_names = self.prepare_doc_for_saving(doc, objects)
        doc.to_disk(f'{output_directory}/{file_name}')
        PandasUtil.write_object_list_as_dataframe_file(doc._.entities, file_name, f'{output_directory}/as_df', columns=objects_column_names)