def run_bot():
    """Scrape the locally saved chamber.nyc directory pages and append the
    parsed member rows to the 2021 member CSV.

    Pages are processed in ascending page-number order so the CSV rows come
    out in the same order as the site's directory listing.
    """
    import os
    import re

    BASE_URL = 'https://www.chamber.nyc/directory.php?search=&page='
    NUM_OF_PAGES = 42
    # save_pages(BASE_URL, NUM_OF_PAGES)  # uncomment to (re)download first
    filename = "data/2021/member.csv"

    def _page_number(path):
        # Numeric sort key from "...page<NN>.html". Using basename + regex
        # instead of the original split('\\page') makes this work with both
        # '/' and '\\' path separators.
        return int(re.search(r'(\d+)\.html$', os.path.basename(path)).group(1))

    # Context manager guarantees the handle is closed (the original leaked it).
    with open(filename, "a+", encoding="utf-8") as f:
        for file in sorted(glob.glob('data/2021/saved_pages/*.html'),
                           key=_page_number):
            page_soup = parse.load_html(file)
            extracted_data = parse.parse_html(page_soup)
            f.write(extracted_data)
def search_for(key):
    """Type *key* into the search field, submit, and return the parsed results.

    The results open in a separate browser window; we switch to it, scrape
    its source, close it, and switch back to the original (root) window.
    Assumes ``driver``, ``search_field``, ``search_button`` and
    ``root_handle`` are module-level globals set up by the caller.
    """
    search_field.clear()
    search_field.send_keys(key)
    command_click(driver, search_button)
    # NOTE little hack-ish, but can't find anything better in the API
    # (the search results open in a window with the same title)
    new_window = None
    for handle in driver.window_handles:
        if handle != root_handle:  # any handle that isn't the root_handle
            new_window = handle
    # extract results from new window.
    # switch_to.window replaces switch_to_window(), which was deprecated in
    # Selenium 3 and removed in Selenium 4.
    driver.switch_to.window(new_window)
    results = parse_html(driver.page_source)
    # Close window, switch back to search page
    driver.close()
    driver.switch_to.window(root_handle)
    return results
def main():
    """Parse every raw HTML file under ./raw_html, assemble the records into
    a DataFrame, clean it, and export the result as GBK-encoded CSV."""
    dir_path = './raw_html'
    # dir_path = input("Input data file path: ")
    file_names = get_file_list(dir_path)
    # print(file_names)
    records = [
        parse_html(read_html(os.path.join(dir_path, name)))
        for name in file_names
    ]
    lagou = clean_data(pd.DataFrame(records))
    # print(df.head())
    print(lagou)
    lagou.to_csv('./output/lagou.csv', encoding='gbk')
def main():
    """Log in to Weibo (or browse as a visitor), walk 10 pages of the home
    feed (each page plus its two lazily-loaded AJAX "bars"), and print the
    total number of posts parsed.

    Expects ``sys.argv[1]`` = username and ``sys.argv[2]`` = password.

    Raises:
        RuntimeError: if the uid cannot be extracted from the main page
            (typically a failed login).
    """
    LOGIN = True
    username = sys.argv[1]
    password = sys.argv[2]
    s = requests.Session()
    if LOGIN:
        s.cookies = login(username, password)
    else:
        r = s.get(cons.VISITOR_INCARNATE)
    r = s.get(cons.WEIBO_MAIN)
    r = r.content.decode('utf8')
    # Raw string: '\$' / '\[' are invalid escapes in a plain literal
    # (SyntaxWarning since Python 3.12, error in the future).
    m = re.search(r"\$CONFIG\['uid'\]='([0-9]+)';", r)
    if m is None:
        # The original crashed with an opaque AttributeError here.
        raise RuntimeError('could not extract uid from the main page (login failed?)')
    uid = m.group(1)
    count = 0
    for i in range(1, 11):
        r = s.get(cons.WEIBO_HOME_NUMPAGE.format(uid, i - 1, i))
        print('PAGE:', i, 'HTML')
        count += parse_html(r.content.decode('utf8'))
        # Each page lazily loads two extra segments ("bars") via AJAX.
        for j in range(0, 2):
            r = s.get(cons.WEIBO_HOME_AJAX.format(i, i, j))
            print('PAGE:', i, 'BAR:', j)
            count += parse_ajax(r.content.decode('unicode-escape'))
    print(count)
def fetch_remote_item_data():
    """Download the remote page and return its parsed item data."""
    html = fetch_html()
    return parse_html(html)
import json
from output import create_output_dir
from output import save_author_data
from parse import parse_html

if __name__ == '__main__':
    # Load the author index produced upstream.
    with open('authors.json', 'r') as file:
        authors = json.loads(file.read())
    output_path = create_output_dir()
    for author in authors:
        name = author.get('author').strip()
        link = author.get('link')
        recordings = parse_html(author.get('html'), link)
        save_author_data(
            name,
            {'name': name, 'link': link, 'recordings': recordings},
            output_path,
        )
import json
from output import create_output_dir
from output import save_author_data
from parse import parse_html

if __name__ == '__main__':
    # Read the author index and fan each entry out to its own data file.
    with open('authors.json', 'r') as file:
        authors = json.loads(file.read())

    output_path = create_output_dir()
    for entry in authors:
        author_name = entry.get('author').strip()
        page_link = entry.get('link')
        page_html = entry.get('html')
        parsed_recordings = parse_html(page_html, page_link)
        record = {
            'name': author_name,
            'link': page_link,
            'recordings': parsed_recordings,
        }
        save_author_data(author_name, record, output_path)