Exemple #1
0
def main():
    """Fetch page batches from the server, extract URLs in parallel, post them back.

    Connects to ``ADDR``, sends ``PRE_HTML`` as text, waits for a reply
    containing ``'WAIT'`` and then one containing ``'OK'``, and afterwards
    loops forever:

    1. pull a batch with ``client.get_data_from_server``;
    2. run ``analyse_some_att`` on every item in a 4-process pool;
    3. post the resulting URLs back, flushing whenever more than five have
       accumulated, plus one final post of the remainder for the round
       (that final post may carry an empty list).

    Raises:
        ConnectionError: if the server closes the socket during the
            handshake.
    """
    pool = mp.Pool(4)
    datas = []
    tcpCliSock = socket(AF_INET, SOCK_STREAM)
    tcpCliSock.connect(ADDR)
    # presumably PRE_HTML identifies this client's role -- confirm with server.
    tcpCliSock.send(str(PRE_HTML).encode())

    # NOTE(review): if 'WAIT' and 'OK' arrive in the same recv() chunk the
    # second loop never sees 'OK' -- confirm the server sends them apart.
    while True:
        chunk = tcpCliSock.recv(100)
        if not chunk:
            # recv() returning b'' means the peer closed the connection;
            # without this check the loop would spin forever.
            raise ConnectionError("server closed the connection during handshake")
        if 'WAIT' in chunk.decode():
            print("WAIT")
            break
    while True:
        chunk = tcpCliSock.recv(100)
        if not chunk:
            raise ConnectionError("server closed the connection during handshake")
        if 'OK' in chunk.decode():
            print("OK")
            break
    print("Connect Already ...")

    while True:
        htmls = client.get_data_from_server(tcpCliSock)
        # Truthiness covers both None and an empty batch (the original used
        # `len(htmls) is 0`, an identity test on an int literal).
        if not htmls:
            continue
        parse_jobss = [
            pool.apply_async(analyse_some_att, args=(html, )) for html in htmls
        ]
        # .get() blocks until each worker result is available.
        url_lists = [j.get() for j in parse_jobss]
        for url_list in url_lists:
            for url in url_list:
                datas.append(url)
                # Flush once a sixth item has been queued.
                if len(datas) > 5:
                    client.post_data_to_server(tcpCliSock, datas)
                    datas = []
        # Post whatever is left for this round, even if it is empty.
        client.post_data_to_server(tcpCliSock, datas)
        datas = []
Exemple #2
0
def main():
    """Relay newly seen URLs from the server back to it, one round at a time.

    Connects to ``ADDR``, sends ``"5"``, waits for a reply containing
    ``'WAIT'`` and then one containing ``'OK'``, and afterwards loops
    forever: pull a batch of URLs, keep only those not seen in earlier
    rounds, post them back in a single call, send ``"NEXT"``, and remember
    them as seen.

    Raises:
        ConnectionError: if the server closes the socket during the
            handshake.
    """
    tcpCliSock = socket(AF_INET, SOCK_STREAM)
    tcpCliSock.connect(ADDR)
    tcpCliSock.send(str(5).encode())

    # NOTE(review): if 'WAIT' and 'OK' arrive in the same recv() chunk the
    # second loop never sees 'OK' -- confirm the server sends them apart.
    while True:
        chunk = tcpCliSock.recv(100)
        if not chunk:
            # recv() returning b'' means the peer closed the connection;
            # without this check the loop would spin forever.
            raise ConnectionError("server closed the connection during handshake")
        buf = chunk.decode()
        print(buf)
        if 'WAIT' in buf:
            break
    while True:
        chunk = tcpCliSock.recv(100)
        if not chunk:
            raise ConnectionError("server closed the connection during handshake")
        if 'OK' in chunk.decode():
            break
    print("Connect Already ...")

    seen_url = set()
    while True:
        urls = client.get_data_from_server(tcpCliSock)
        if urls is None:
            # Other callers of get_data_from_server in this file skip a None
            # result; do the same here instead of failing in set(None).
            # NOTE(review): confirm the server does not expect "NEXT" here.
            continue
        # The original subtracted an undefined name `seen` (NameError).
        unseen_url = set(urls) - seen_url
        # The original also had a batching loop over `url_list`, but nothing
        # was ever appended to it, so the only effective post was this one.
        client.post_data_to_server(tcpCliSock, list(unseen_url))
        tcpCliSock.send("NEXT".encode())
        # Record the URLs as seen; the original cleared the set first, so
        # nothing was ever remembered.
        seen_url.update(unseen_url)
def main():
    """Collect user records from server-supplied pages and post back follow-up URLs.

    Truncates ``./res/userdata``, connects to ``ADDR``, sends ``USER_HTML``
    as text, waits for a reply containing ``'WAIT'`` and then one containing
    ``'OK'``, and afterwards loops forever:

    1. pull a batch with ``client.get_data_from_server``;
    2. run ``analyse_user_data`` on every item in a 4-process pool; each
       result is unpacked as ``(title, att_url, fans_url, location,
       autograph)``;
    3. append one line per new record to ``./res/userdata`` and count
       non-empty locations in a pandas Series;
    4. post the round's ``att_url`` / ``fans_url`` values back, flushing
       whenever more than five have accumulated, plus a final post of the
       remainder;
    5. write the location counts to ``./res/location.csv``.

    Raises:
        ConnectionError: if the server closes the socket during the
            handshake.
    """
    pool = mp.Pool(4)
    locations = pd.Series([0], index=['北京'])
    att_urls = set()
    # Start each run with an empty output file; the handle is closed
    # immediately rather than being left open for the life of the process.
    with open('./res/userdata', 'w'):
        pass
    tcpCliSock = socket(AF_INET, SOCK_STREAM)
    tcpCliSock.connect(ADDR)
    tcpCliSock.send(str(USER_HTML).encode())

    # NOTE(review): if 'WAIT' and 'OK' arrive in the same recv() chunk the
    # second loop never sees 'OK' -- confirm the server sends them apart.
    while True:
        chunk = tcpCliSock.recv(100)
        if not chunk:
            # recv() returning b'' means the peer closed the connection;
            # without this check the loop would spin forever.
            raise ConnectionError("server closed the connection during handshake")
        buf = chunk.decode()
        print(buf)
        if 'WAIT' in buf:
            print("WAIT")
            break
    while True:
        chunk = tcpCliSock.recv(100)
        if not chunk:
            raise ConnectionError("server closed the connection during handshake")
        if 'OK' in chunk.decode():
            print("OK")
            break
    print("Connect Already ...")

    while True:
        htmls = client.get_data_from_server(tcpCliSock)
        # Truthiness covers both None and an empty batch.
        if not htmls:
            continue
        print("Already Get data ...")
        print('\nAnalyse attation html ing ...')
        parse_jobs = [
            pool.apply_async(analyse_user_data, args=(html, ))
            for html in htmls
        ]
        # .get() blocks until each worker result is available.
        user_datas = [j.get() for j in parse_jobs]

        # Reopen in append mode per round and close it deterministically so
        # the round's records are flushed before the URLs are posted.
        with open('./res/userdata', 'a') as file_userdata:
            for title, att_url, fans_url, location, autograph in user_datas:
                # Compare strings by value; `is ""` tests object identity.
                if att_url == "" or att_url in att_urls:
                    continue
                user_data = (
                    "title:" + title
                    + "att_url:" + att_url
                    + "fans_url:" + fans_url
                    + "location:" + location
                    + "autograph:" + autograph
                    + "\n"
                )
                file_userdata.write(user_data)
                if location != "":
                    if location in locations:
                        locations[location] += 1
                    else:
                        locations[location] = 1
                att_urls.add(att_url)
                if fans_url != "":
                    att_urls.add(fans_url)

        post_datas = []
        for url in att_urls:
            post_datas.append(url)
            print("Start Post Data ...")
            # Flush once a sixth item has been queued.
            if len(post_datas) > 5:
                client.post_data_to_server(tcpCliSock, post_datas)
                post_datas = []
        # Post whatever is left for this round, even if it is empty.
        client.post_data_to_server(tcpCliSock, post_datas)
        print("\nLocations:\n", locations)
        locations.to_csv('./res/location.csv')
        # De-duplication is per round only: the set is reset every iteration.
        att_urls.clear()
Exemple #4
0
def get_some_data(tcpCliSock):
    """Continuously pull URL batches from the server into the shared ``unseen`` set.

    Loops forever calling ``client.get_data_from_server(tcpCliSock)``; every
    non-``None`` result is merged into the module-level ``unseen``
    collection while holding the module-level ``lock``.

    Args:
        tcpCliSock: connection handed to ``client.get_data_from_server``.
    """
    # `lock` and `unseen` are only mutated, never rebound, so no `global`
    # declaration is needed to reach them.
    while True:
        urls = client.get_data_from_server(tcpCliSock)
        if urls is None:
            continue
        # Build the set before taking the lock so a bad payload cannot raise
        # while the lock is held, and the critical section stays short.
        new_urls = set(urls)
        # `with` guarantees the lock is released even if update() raises;
        # the original acquire()/release() pair had no try/finally.
        with lock:
            unseen.update(new_urls)
Exemple #5
0
def get_some_data(tcpCliSock):
    """Continuously pull URL batches from the server into the shared ``unseen`` set.

    Prints a start banner, then loops forever calling
    ``client.get_data_from_server(tcpCliSock)``; every non-``None`` result is
    merged into the module-level ``unseen`` collection while holding the
    module-level ``lock``.

    Args:
        tcpCliSock: connection handed to ``client.get_data_from_server``.
    """
    # `unseen` and `lock` are only mutated, never rebound, so no `global`
    # declaration is needed to reach them.
    print("Start Get Data ...")
    while True:
        urls = client.get_data_from_server(tcpCliSock)
        if urls is None:
            continue
        # Build the set before taking the lock so a bad payload cannot raise
        # while the lock is held, and the critical section stays short.
        new_urls = set(urls)
        # `with` guarantees the lock is released even if update() raises;
        # the original acquire()/release() pair had no try/finally.
        with lock:
            unseen.update(new_urls)