def compare_url():
    """Compare ids embedded in txt link lines against ids stored in xls data.

    For each category directory in FILE_DIR_LIST, the txt list files
    (folder[0]) are paired positionally with the xls detail files
    (folder[1]).  For every (txt line, xls row) pair the id at the end of
    the line's URL (after the last '=') must equal the integer in xls
    column 1.  Mismatches and unparsable xls cells are counted per file.

    :return: None — results are printed only.
    """
    for directory in FILE_DIR_LIST:
        txt_files = [f for f in os.listdir(os.getcwd() + directory + folder[0])
                     if f[-3:] == 'txt']
        xls_files = [f for f in os.listdir(os.getcwd() + directory + folder[1])
                     if f[-3:] == 'xls']
        for l, d in zip(txt_files, xls_files):
            print(l + ' vs ' + d)
            error = []
            list_content = get_file_content(directory + folder[0] + l)
            detail_content = get_file_pd(directory + folder[1] + d)
            for l_line, d_line in zip(list_content,
                                      range(detail_content.shape[0])):
                l_url = str(l_line).split(',')[-1].strip()
                l_id = int(l_url.split('=')[-1])
                try:
                    d_id = int(detail_content.iat[d_line, 1])
                except ValueError:
                    # BUG FIX: the original appended `d_id` here, which is
                    # stale from the previous row (or undefined on the first
                    # one).  Record the offending xls row index instead and
                    # skip the comparison for this row.
                    error.append(d_line)
                    continue
                if l_id != d_id:
                    error.append(l_line.split('.')[0])
                    print('row ' + str(l_line.split('.')[0]) + ' : '
                          + str(l_id) + ' == ' + str(d_id))
            if len(error):
                print('with ' + str(len(error)) + ' error has found')
            print('--------------------------------')
def txt_check(directory, file_arr):
    """Check txt list files for 'failed' markers and out-of-order indexes.

    File names look like 'xxx-xxx-<startpage>-...'; the third '-' segment
    is the 1-based start page.  With 15 entries per page, each line's
    leading number is converted back to a position within the file and
    compared to its actual position.

    :param directory: category directory (relative path fragment)
    :param file_arr: txt file names to check inside directory + folder[0]
    :return: None — results are printed only.
    """
    for l in file_arr:
        f_path = directory + folder[0] + l
        start = int(re.compile('-').split(l)[2]) - 1
        print(f_path)
        content = get_file_content(f_path)
        error = []
        fail = []
        for index, item in enumerate(content):
            # Idiom fix: 'x in s' instead of s.__contains__(x).
            if 'failed' not in item:
                if item.strip().split('.')[0]:
                    # 15 entries per page: map the absolute item number to
                    # the expected 1-based position within this file.
                    number = int(item.strip().split('.')[0]) - (15 * start)
                    if number != index + 1:
                        error.append(item)
                        print(str(index) + ' == ' + item)
            else:
                fail.append(item)
        if error or fail:
            print('check ' + f_path + ' finish : with ' + str(len(fail))
                  + ' failed and ' + str(len(error))
                  + ' error index has found')
        else:
            print('check ' + f_path + ' finish : with no error')
    print('------ ' + directory + ' txt check finish -------')
def fill_xls(directory, l, d):
    """Fill missing rows of an xls detail file (insert and append).

    NOTE: may fail on overly fancy content formatting (per original author).

    :param directory: category
    :param l: list file, txt
    :param d: detail file, xls
    """
    list_content = get_file_content(directory + folder[0] + l)
    detail_content = get_file_pd(directory + folder[1] + d)
    # File name segment 2 is the 1-based start page; 15 rows per page.
    start = (int(re.compile('-').split(d)[2]) - 1) * 15
    browser = webdriver.Chrome()
    col_len = len(detail_content.columns)
    try:
        while detail_content.shape[0] < len(list_content):
            detail_content = insert_or_append_xls(browser, detail_content,
                                                  list_content, start,
                                                  col_len)
            # Checkpoint after every pass so progress survives a crash.
            detail_content.to_excel(os.getcwd() + directory + folder[1] + d,
                                    index=False)
    finally:
        print('fill_xls() end')
        # BUG FIX: the original called browser.close(), which only closes
        # the window and leaks the driver process; quit() ends the session.
        browser.quit()
        detail_content.to_excel(os.getcwd() + directory + folder[1] + d,
                                index=False)
def output_excel(file, name):
    """Crawl a detail page for every entry of a txt list file, dump to xlsx.

    Each non-blank line is expected to look like 'name,url' where the url
    carries the id after its last '='.  Rows whose detail crawl returns
    nothing are written out as '<line>,None'.

    :param file: path of the txt list file
    :param name: output file name without the '.xlsx' extension
    :return: None — the workbook is written as a side effect.
    """
    data = pd.DataFrame()
    txt_content = get_file_content(file)
    b = webdriver.Chrome()
    try:
        # read file, get url, get No.
        for line in txt_content:
            if line.strip():
                try:
                    drug_url = line.split(',')[-1]
                    # The id is whatever follows the last '=' in the line
                    # (only the URL part contains '=').
                    drug_id = line.split('=')[-1]
                    # crawl detail
                    detail = crawl_detail(b, drug_url)
                    if detail:
                        detail.insert(1, str(drug_id).strip())
                        detail_arr = arrange(detail[1:-8], drug_url)
                        title = get_title(detail[1:-8])
                        data = data.append(pd.DataFrame(columns=title,
                                                        data=[detail_arr]),
                                           ignore_index=True, sort=False)
                    else:
                        data = data.append(
                            pd.DataFrame(data=[line + ',None']),
                            ignore_index=True, sort=False)
                finally:
                    # Checkpoint after every row so a crash loses nothing.
                    data.to_excel(name + '.xlsx')
    finally:
        # BUG FIX: the original never released the Chrome driver, leaking
        # a chromedriver process per call.
        b.quit()
def txt_retry(path):
    """Replace 'page xx crawl failed' markers in a txt list file.

    Re-crawls each failed page and splices the recovered 'name,url' lines
    back into the file in place of the marker row, then rewrites the file.

    :param path: txt file path relative to the working directory
    :return: None — the file is rewritten as a side effect.
    """
    content = get_file_content(path)
    txt_file = pd.DataFrame(content)
    browser = webdriver.Chrome()
    try:
        # Walk backwards so earlier row positions stay valid after
        # drops/insertions at higher indexes.
        for row in reversed(range(txt_file.shape[0])):
            s = str(txt_file.iat[row, 0]).strip()
            if not s:
                # BUG FIX: DataFrame.drop is not in-place; the original
                # discarded the result, so blank rows were never removed.
                txt_file = txt_file.drop(row)
            if 'failed' in s:
                page = s.split(' ')[1]
                print(path + ' , ' + page)
                url_dir = get_ids(browser, util.get_list_url(page))
                if url_dir:
                    count = 0
                    txt_file = txt_file.drop([row], axis=0)
                    for k, v in url_dir.items():
                        new_str = '{name},{urls}'.format(name=k,
                                                         urls=v + '\n')
                        txt_file = pd.DataFrame(
                            pd.np.insert(txt_file.values, row + count,
                                         [new_str]))
                        count += 1
        # BUG FIX: sort_index returns a new frame; the original discarded
        # it, making the sort a no-op.
        txt_file = txt_file.sort_index(axis=0)
    except Exception as e:
        print('txt_retry() Exception : ' + str(e))
    finally:
        pd.np.savetxt(os.getcwd() + path, txt_file.values, fmt='%s',
                      encoding='utf-8', newline='')
        print('txt_retry() ' + path + ' finish.')
def compare_txt_xls(d, directory, l):
    """Compare row numbers of a txt list file against an xls detail file.

    Pairs each txt line with the xls row at the same position; the number
    before the first '.' in the txt line must equal column 0 of the xls
    row.  Missing rows on either side compare as -1 and are reported.

    :param d: detail file name (xls)
    :param directory: category directory
    :param l: list file name (txt)
    :return: None — results are printed only.
    """
    print(l + ' vs ' + d)
    list_content = get_file_content(directory + folder[0] + l)
    detail_content = get_file_pd(directory + folder[1] + d)
    error = []
    for l_line, d_line in zip_longest(list_content,
                                      range(detail_content.shape[0]),
                                      fillvalue=None):
        # BUG FIX: when the xls has more rows than the txt, zip_longest
        # yields l_line=None and the original crashed on l_line.split();
        # mirror the -1 sentinel already used for the d_line side.
        if l_line is None:
            txt_number = -1
        else:
            txt_number = int(l_line.split('.')[0])
        if d_line is None:
            xls_number = -1
        else:
            xls_number = int(detail_content.iat[d_line, 0])
        if txt_number != xls_number:
            print(txt_number, xls_number)
            error.append(txt_number)
    if len(error):
        print(l + ' vs ' + d + ' finish : with ' + str(len(error))
              + ' error found')
    else:
        print(l + ' vs ' + d + ' finish : with no error')
    print('------------------------------------')
def fill_txt(directory, l):
    """Fill gaps in a txt list file: insert missing rows, append missing pages.

    (Original docstring said "fill xls" — this function actually repairs the
    txt list file.)  The file name encodes its page range:
    '<..>-<..>-<startpage>-<endpage>.txt', 15 entries per page.  First pass
    re-crawls and inserts any row whose leading number does not match its
    position; second pass re-crawls and appends any pages missing from the
    tail.  The file is rewritten in full on exit, even after a
    WebDriverException.

    :param directory: category directory (relative path fragment)
    :param l: txt list file name
    :return: None — the file is rewritten as a side effect.
    """
    # 0-based start page and inclusive end page, parsed from the file name.
    start = int((l.split('-')[2]).split('.')[0]) - 1
    end = int((l.split('-')[3]).split('.')[0])
    # print('start:' + start.__str__() + ' , end =' + end.__str__())
    list_content = get_file_content(directory + folder[0] + l)
    # print(list_content.__len__())
    browser = webdriver.Chrome()
    # insert
    try:
        for index, line in enumerate(list_content):
            if line.strip():
                # Expected 1-based position of this entry within the file.
                number = int(line.split('.')[0]) - start * 15
                if index + 1 != number:
                    print(index + 1, number)
                    # Re-crawl the page that should contain the missing entry.
                    page = math.ceil(index / 15) + start
                    urls = get_ids(browser, get_list_url(page))
                    # print(urls)
                    if urls:
                        for k, v in urls.items():
                            # Insert only the entry whose number matches the gap.
                            if int(str(k).split('.')[0]) == (index + start * 15):
                                insert_str = '{name},{urls}'.format(name=k, urls=v + '\n')
                                print(l + ' ' + index.__str__() + ' row insert : ' + insert_str)
                                list_content.insert(index, insert_str)
                    else:
                        # Crawl returned nothing: insert a placeholder row.
                        list_content.insert(index, index.__str__() + '. crawl failed')
        print('--- ' + l + ' insert finish. ---')
        # append
        last_index = list_content.__len__()
        # Page the file currently ends on (rounded to nearest whole page).
        from_page = int(round((last_index / 15) + start, 0))
        # print(from_page, end)
        # NOTE(review): from_page == end is treated as "file complete" — the
        # append loop is reached only when pages are missing; confirm against
        # the original (un-collapsed) indentation.
        if from_page != end and from_page <= end:
            # Truncate the possibly partial last page, then re-crawl from there.
            last_content = (from_page - 1) * 15
            list_content = list_content[0:last_content]
            while from_page <= end:
                try:
                    url = util.get_list_url(from_page)
                    print(l + ' append page ' + from_page.__str__() + ' , ' + url)
                    url_list = get_ids(browser, url)
                    try:
                        for k, v in url_list.items():
                            # Any non-http value means the page content is bad:
                            # abort this page and record it as failed.
                            if not v.startswith('http'):
                                raise Exception('cannot get page content : ' + k + ',' + v)
                            append_str = k + ',' + v
                            list_content.append('\n' + append_str)
                            # print(append_str)
                    except Exception as e:
                        list_content.append('page ' + from_page.__str__() + ' crawl failed')
                        print(l + ' page ' + from_page.__str__() + ' crawl failed , with exception :' + e.__str__())
                except Exception as e:
                    list_content.append(' page ' + from_page.__str__() + ' crawl failed')
                    print(l + ' page ' + from_page.__str__() + ' crawl failed, with exception : ' + e.__str__())
                    # continue still runs the finally below, so the page
                    # counter always advances.
                    continue
                finally:
                    from_page += 1
        print('--- ' + l + ' append finish. ---')
    except WebDriverException as e:
        print(e.__str__())
    finally:
        # Always write the (possibly partially repaired) content back.
        with codecs.open(os.getcwd() + directory + folder[0] + l, 'wb', encoding='utf-8') as f:
            f.write(''.join(list_content))
            f.close()