def main():
    pool = mp.Pool(4)
    datas = []
    tcpCliSock = socket(AF_INET, SOCK_STREAM)
    tcpCliSock.connect(ADDR)
    tcpCliSock.send(str(PRE_HTML).encode())
    # Handshake: the server first queues this worker (WAIT), then releases it (OK)
    while True:
        buf = tcpCliSock.recv(100).decode()
        if 'WAIT' in buf:
            print("WAIT")
            break
    while True:
        buf = tcpCliSock.recv(100).decode()
        if 'OK' in buf:
            print("OK")
            break
    print("Connected ...")
    while True:
        htmls = client.get_data_from_server(tcpCliSock)
        if htmls is None or len(htmls) == 0:
            continue
        # Parse each page in a worker process; every job returns a list of URLs
        parse_jobs = [
            pool.apply_async(analyse_some_att, args=(html,))
            for html in htmls
        ]
        url_lists = [j.get() for j in parse_jobs]
        htmls = []
        for url_list in url_lists:
            for url in url_list:
                datas.append(url)
                # Post extracted URLs back to the server in small batches
                if len(datas) > 5:
                    client.post_data_to_server(tcpCliSock, datas)
                    datas = []
        # Flush whatever is left in the current batch
        client.post_data_to_server(tcpCliSock, datas)
        datas = []
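# The worker above leans on a small `client` helper module plus the module
# constants ADDR and PRE_HTML, none of which appear in this section. The
# sketch below is a minimal assumption of that interface: the function names
# come from the calls above, but the JSON-over-socket framing, host/port,
# and type code are placeholders, not the real implementation.
import json
import multiprocessing as mp
from socket import socket, AF_INET, SOCK_STREAM

ADDR = ('127.0.0.1', 8080)  # assumed server address
PRE_HTML = 4                # assumed worker-type code sent on connect

def get_data_from_server(tcpCliSock, bufsize=65536):
    # Assumed framing: one recv() carries one JSON-encoded list
    raw = tcpCliSock.recv(bufsize).decode()
    return json.loads(raw) if raw else None

def post_data_to_server(tcpCliSock, datas):
    # Assumed framing: the list goes back as a single JSON message
    tcpCliSock.send(json.dumps(datas).encode())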
def main():
    tcpCliSock = socket(AF_INET, SOCK_STREAM)
    tcpCliSock.connect(ADDR)
    tcpCliSock.send(str(5).encode())
    while True:
        buf = tcpCliSock.recv(100).decode()
        print(buf)
        if 'WAIT' in buf:
            break
    while True:
        buf = tcpCliSock.recv(100).decode()
        if 'OK' in buf:
            break
    print("Connected ...")
    unseen_url = set()
    seen_url = set()
    while True:
        urls = client.get_data_from_server(tcpCliSock)
        # Keep only URLs that have not been dispatched before
        unseen_url.update(set(urls) - seen_url)
        url_list = []
        for url in unseen_url:
            url_list.append(url)
            # Hand unseen URLs back to the server in batches
            if len(url_list) > 20:
                client.post_data_to_server(tcpCliSock, url_list)
                url_list = []
                tcpCliSock.send("NEXT".encode())
        # Flush the final partial batch
        client.post_data_to_server(tcpCliSock, url_list)
        url_list = []
        tcpCliSock.send("NEXT".encode())
        # Record the dispatched URLs as seen *before* clearing the set
        seen_url.update(unseen_url)
        unseen_url.clear()
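# The seen/unseen bookkeeping above is the whole point of this manager:
# every URL must be dispatched at most once across rounds. A standalone
# sketch of the same pattern (no sockets; dedup_rounds is a hypothetical
# name) makes the invariant easy to test in isolation:
def dedup_rounds(rounds):
    seen, unseen = set(), set()
    dispatched = []
    for urls in rounds:
        unseen.update(set(urls) - seen)    # drop already-handled URLs
        dispatched.append(sorted(unseen))  # what this round would send
        seen.update(unseen)                # mark as seen before clearing
        unseen.clear()
    return dispatched

# The repeated 'b' in the second round is filtered out:
assert dedup_rounds([['a', 'b'], ['b', 'c']]) == [['a', 'b'], ['c']]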
def main():
    pool = mp.Pool(4)
    locations = pd.Series([0], index=['北京'])
    att_urls = set()
    # Truncate the user-data file before appending results
    open('./res/userdata', 'w').close()
    tcpCliSock = socket(AF_INET, SOCK_STREAM)
    tcpCliSock.connect(ADDR)
    tcpCliSock.send(str(USER_HTML).encode())
    while True:
        buf = tcpCliSock.recv(100).decode()
        print(buf)
        if 'WAIT' in buf:
            print("WAIT")
            break
    while True:
        buf = tcpCliSock.recv(100).decode()
        if 'OK' in buf:
            print("OK")
            break
    print("Connected ...")
    while True:
        htmls = client.get_data_from_server(tcpCliSock)
        if htmls is None or len(htmls) == 0:
            continue
        print("Data received ...")
        print('\nAnalysing user pages ...')
        parse_jobs = [
            pool.apply_async(analyse_user_data, args=(html,))
            for html in htmls
        ]
        user_datas = [j.get() for j in parse_jobs]
        with open('./res/userdata', 'a') as file_userdata:
            for title, att_url, fans_url, location, autograph in user_datas:
                if att_url == "" or att_url in att_urls:
                    continue
                user_data = (f"title:{title} att_url:{att_url} "
                             f"fans_url:{fans_url} location:{location} "
                             f"autograph:{autograph}\n")
                file_userdata.write(user_data)
                # Tally users per location ('in' checks the Series index)
                if location != "":
                    if location in locations:
                        locations[location] += 1
                    else:
                        locations[location] = 1
                att_urls.add(att_url)
                if fans_url != "":
                    att_urls.add(fans_url)
        print("Start posting data ...")
        post_datas = []
        for url in att_urls:
            post_datas.append(url)
            if len(post_datas) > 5:
                client.post_data_to_server(tcpCliSock, post_datas)
                post_datas = []
        client.post_data_to_server(tcpCliSock, post_datas)
        print("\nLocations:\n", locations)
        locations.to_csv('./res/location.csv')
        att_urls.clear()
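# analyse_user_data is not shown in this section. The sketch below is a
# minimal, assumed version that returns the five-tuple unpacked above;
# every CSS selector here is a hypothetical placeholder, since the real
# parser depends on the crawled site's actual markup.
from bs4 import BeautifulSoup

def analyse_user_data(html):
    soup = BeautifulSoup(html, 'html.parser')

    def text_of(selector):
        node = soup.select_one(selector)
        return node.get_text(strip=True) if node else ""

    def href_of(selector):
        node = soup.select_one(selector)
        return node.get('href', "") if node else ""

    title = text_of('.profile-name')          # hypothetical selector
    att_url = href_of('a.following-link')     # hypothetical selector
    fans_url = href_of('a.followers-link')    # hypothetical selector
    location = text_of('.profile-location')   # hypothetical selector
    autograph = text_of('.profile-headline')  # hypothetical selector
    return title, att_url, fans_url, location, autograph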
def get_some_data(tcpCliSock):
    global lock, unseen
    while True:
        urls = client.get_data_from_server(tcpCliSock)
        if urls is not None:
            with lock:
                unseen.update(set(urls))
def get_some_data(tcpCliSock):
    global unseen, lock
    print("Start getting data ...")
    while True:
        urls = client.get_data_from_server(tcpCliSock)
        if urls is not None:
            # Merge new URLs under the lock shared with the crawl loop
            with lock:
                unseen.update(set(urls))
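# get_some_data is written to run on a background thread against
# module-level shared state. A minimal wiring sketch, assuming `lock` and
# `unseen` live at module scope; start_receiver is a hypothetical helper,
# not part of the original code:
import threading

lock = threading.Lock()
unseen = set()

def start_receiver(tcpCliSock):
    # daemon=True so the receiver thread dies with the main program
    t = threading.Thread(target=get_some_data, args=(tcpCliSock,), daemon=True)
    t.start()
    return t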