def get_thread(link, session=None):
    """Scrape every post of one thread and export them to a CSV file.

    Args:
        link: Thread URL. It is passed unchanged to ``get_num_pages_post``,
            ``get_posts_page`` and ``get_post``.
        session: Optional session, forwarded to ``get_html_session``.

    The output file name is the part of ``link[9:]`` that follows the last
    ``"t="`` and precedes the next ``"&"``.
    """
    session = get_html_session(session)
    number_of_pages_post = get_num_pages_post(link, session)
    df_list = []

    # Processing post pages
    for thread_page in range(1, number_of_pages_post + 1):
        for idx, post in enumerate(get_posts_page(link, thread_page, session)):
            try:
                df_list.append(get_post(post, link, session))
            except Exception:
                # Best effort: one broken post must not abort the thread.
                traceback.print_exc()
                print("problem with post", idx, ":", link)

    # Export data posts
    name_file = link[9:].split("t=")
    name_file = name_file[-1].split("&")
    df = pd.DataFrame(df_list)
    df.to_csv("./data/forums/theattraction/posts/" + name_file[0] + ".csv",
              index=False)
    # NOTE: the previous version called exit() here, which terminated the
    # whole process after the first exported thread. The function now simply
    # returns so the caller can go on with the remaining threads.
def get_posts_page(link, thread_page, session=None):
    """Fetch one page of a thread and return the post elements found on it.

    Args:
        link: Thread URL; the ``&start=`` offset is appended to it.
        thread_page: 1-based page number (anything ``int()`` accepts).
        session: Optional session, forwarded to ``get_html_session``.

    Returns:
        The result of searching the page for ``#page-body article``.
    """
    session = get_html_session(session)
    # The offset grows by 30 for every page and is 0 for the first one.
    offset = (int(thread_page) - 1) * 30
    page_url = link + "&start=" + str(offset)
    response = session.get(page_url)
    # Get elements post
    return response.html.find('#page-body article')
def get_thread(link, session=None):
    """Scrape a thread, number its posts and export them to a CSV file.

    Args:
        link: Thread link; it is appended to ``INCELS_URL`` before use.
        session: Optional session, forwarded to ``get_html_session``.

    Posts for which ``get_post`` returns a falsy value are skipped and do
    not consume a post number. The file name suffix is whatever follows the
    last ``"="`` of the full URL (from character 9 on), with ``"/"`` removed.
    """
    session = get_html_session(session)
    link = INCELS_URL + link
    total_pages = get_num_pages_post(link, session)

    rows = []
    next_number = 1

    # Processing post pages
    for page_no in range(1, total_pages + 1):
        page_posts = get_posts_page(link, page_no, session)
        for idx, post in enumerate(page_posts):
            try:
                record = get_post(post, link, session)
                if not record:
                    continue
                record["number_post"] = next_number
                rows.append(record)
                next_number += 1
            except Exception:
                traceback.print_exc()
                print("problem with post", idx, ":", link)

    frame = pd.DataFrame(rows)

    # Export data posts
    last_chunk = link[9:].split("=")[-1]
    suffix = re.sub("/", "", last_chunk)
    frame.to_csv(
        "./data/forums/redpilltalk/posts/redpilltalk_" + suffix + ".csv",
        index=False)
def get_thread(link, session=None):
    """Scrape a thread, number its posts and export them to a CSV file.

    Args:
        link: Thread link, forwarded to ``get_num_pages_post``,
            ``get_posts_page`` and ``get_post``.
        session: Optional session, forwarded to ``get_html_session``.

    ``get_posts_page`` is expected to return two parallel sequences; the
    i-th entries of both are passed together to ``get_post``.

    ``number_post`` is a running 1-based counter over the successfully
    parsed posts of the whole thread. The previous formula,
    ``count_page * (i + 1)``, produced colliding, non-sequential numbers
    (e.g. post 2 of page 1 and post 1 of page 2 both got 2).
    """
    session = get_html_session(session)
    number_of_pages_post = get_num_pages_post(link, session)
    df_list = []
    count_post = 1

    # Processing post pages
    for thread_page in range(1, number_of_pages_post + 1):
        post_elements = get_posts_page(link, thread_page, session)
        for i in range(len(post_elements[0])):
            try:
                # Indexing the second sequence stays inside the try so a
                # length mismatch is reported per post instead of crashing.
                element = [post_elements[0][i], post_elements[1][i]]
                post_dict = get_post(element, link, session)
                post_dict['number_post'] = count_post
                df_list.append(post_dict)
                # Only posts that were actually stored consume a number.
                count_post += 1
            except Exception:
                traceback.print_exc()
                print("problem with post", i, ":", link)

    # Export data posts
    df = pd.DataFrame(df_list)
    df.to_csv("./data/forums/mgtow/posts/"
              + re.sub("/", "", link[9:].replace(" ", "_")) + ".csv",
              index=False)
def get_num_pages_post(link, session=None):
    """Return the number of pages of a thread.

    Args:
        link: Thread link, appended to ``INCELS_THREAD_BASE``.
        session: Optional session, forwarded to ``get_html_session``.

    Returns:
        The text of the last ``.pageNav-page`` element converted to ``int``,
        or 1 when the page has no such element.
    """
    session = get_html_session(session)
    response = session.get(INCELS_THREAD_BASE + link)

    # Get number pages for post
    try:
        page_labels = [nav.text for nav in response.html.find(".pageNav-page")]
        return int(page_labels[-1])
    except IndexError:
        # No page navigation found: treat the thread as a single page.
        return 1
def get_posts_page(link, thread_page, session=None):
    """Fetch one page of a thread and return its post-related elements.

    Args:
        link: Thread link, appended to ``INCELS_THREAD_BASE``.
        thread_page: Page number, appended after ``"page/"``.
        session: Optional session, forwarded to ``get_html_session``.

    Returns:
        A two-item list: the ``.hentry`` matches, then the
        ``.bbp-reply-header`` matches.
    """
    session = get_html_session(session)
    page_url = INCELS_THREAD_BASE + link + "page/" + str(thread_page)
    page_html = session.get(page_url).html

    # Get list elements post
    return [
        page_html.find('.hentry'),
        page_html.find('.bbp-reply-header'),
    ]
def get_num_pages_post(link, session=None):
    """Return the number of pages of a thread.

    Args:
        link: Thread link, appended to ``INCELS_THREAD_BASE``.
        session: Optional session, forwarded to ``get_html_session``.

    Returns:
        The text of the second-to-last ``.bbp-pagination-links a`` element
        converted to ``int``, or 1 when fewer than two such links exist.
    """
    session = get_html_session(session)
    response = session.get(INCELS_THREAD_BASE + link)

    # Get number pages for post
    try:
        pagination_links = response.html.find(".bbp-pagination-links a")
        return int(pagination_links[-2].text)
    except IndexError:
        return 1
def get_num_pages_post(link, session=None):
    """Return the number of pages of a thread.

    Args:
        link: Full URL of the thread; it is requested as given.
        session: Optional session, forwarded to ``get_html_session``.

    Returns:
        The text of the last ``.pagination span a`` element converted to
        ``int``, or 1 when the page has no such element.
    """
    session = get_html_session(session)
    response = session.get(link)

    # Get number pages for post
    try:
        pagination_links = response.html.find(".pagination span a")
        return int(pagination_links[-1].text)
    except IndexError:
        return 1
def _parse_abbreviated_count(text):
    """Convert a counter such as "987", "1K", "1.5K" or "2M" to an int.

    The previous inline conversion replaced "K" with "000" textually, which
    raised ValueError for decimal values ("1.5K" -> "1.5000"), for an "M"
    suffix and for thousands separators.
    """
    cleaned = text.strip().replace(",", "")
    multipliers = {"K": 1000, "M": 1000000}
    suffix = cleaned[-1:].upper()
    if suffix in multipliers:
        return int(round(float(cleaned[:-1]) * multipliers[suffix]))
    return int(cleaned)


def build_index(src, dst, nump):
    """Build the thread index of the forum and export it to ``dst`` as CSV.

    Args:
        src: Unused by this function.
        dst: Destination passed to ``DataFrame.to_csv``.
        nump: Unused by this function.

    Every listing page ``INCELS_URL + <page number>`` is visited. Threads
    whose replies or views cell is "–" are skipped. Each row stores type,
    title, link, author, replies, views and a fixed subforum name.
    """
    session = get_html_session()

    # Gets the first page
    r = session.get(INCELS_URL + str(1))

    # Find number of pages
    number_of_pages = int([v.text for v in r.html.find(".pageNav-page")][-1])
    df_list = []

    # Process index pages
    for page_num in range(1, number_of_pages + 1):
        print("Page {0}/{1}".format(page_num, number_of_pages))
        r = session.get(INCELS_URL + str(page_num))
        for thread in r.html.find(".structItem--thread"):
            stats = thread.find('.structItem-cell--meta dd')
            if stats[0].text == "–" or stats[1].text == "–":
                continue

            label = thread.find('.labelLink', first=True)
            title_links = thread.find('.structItem-title a')
            # When a label is present it is the first anchor of the title
            # cell, so the actual thread anchor is the second one.
            title_link = title_links[1] if label is not None else title_links[0]

            df_list.append({
                "type": label.text if label is not None else None,
                "title": title_link.text,
                "link": list(title_link.links)[0],
                "author_topic": thread.find('.username')[0].text,
                "replies": _parse_abbreviated_count(stats[0].text),
                "views": _parse_abbreviated_count(stats[1].text),
                "subforum": "Inceldom Discussion",
            })

    # Export data index
    df = pd.DataFrame(df_list)
    df.to_csv(dst)
def get_num_pages_post(link, session=None):
    """Return the number of pages of a thread.

    Args:
        link: Thread link, appended to ``URL_THREAD_BASE``.
        session: Optional session, forwarded to ``get_html_session``.

    Returns:
        The text of the third-from-last ``.pagination_bottom span`` element
        converted to ``int``, or 1 when fewer than three such elements exist.
    """
    session = get_html_session(session)
    response = session.get(URL_THREAD_BASE + link)

    # Get number pages for post
    try:
        pagination_spans = response.html.find(".pagination_bottom span")
        return int(pagination_spans[-3].text)
    except IndexError:
        return 1
def build_index(link, dst, nump):
    """Build the thread index of one subforum and export it as CSV.

    Args:
        link: Subforum link; ``"page/<n>"`` is appended to it.
        dst: Base output path. A trailing ".csv" is stripped and
            ``"_<subforum>.csv"`` is appended (spaces become underscores).
        nump: Unused by this function.

    Only ``dl`` elements that contain a ``.topictitle`` produce a row.
    """
    session = get_html_session()

    # Gets the first page
    # NOTE(review): this request uses INCELS_URL while the loop below uses
    # INCELS_THREAD_BASE - confirm both constants point at the same base.
    r = session.get(INCELS_URL + link + "page/" + str(1))

    # Find number of pages. A listing without pagination links has a single
    # page; previously that case raised IndexError and aborted the index.
    try:
        number_of_pages = int(r.html.find(".pagination span a")[-1].text)
    except IndexError:
        number_of_pages = 1

    # Get a name of subforum
    subforum = r.html.find("#page-body h2")[0].text
    df_list = []

    # Get data index
    for page_num in range(1, number_of_pages + 1):
        print("Forum: {0} - Page {1}/{2}".format(subforum, page_num,
                                                 number_of_pages))
        r = session.get(INCELS_THREAD_BASE + link + "page/" + str(page_num))
        for thread in r.html.find("dl"):
            titles = thread.find(".topictitle")
            if not titles:
                continue

            # The author is the second anchor inside <dt>, when present.
            author_links = thread.find('dt a')
            author = author_links[1].text if len(author_links) >= 2 else ''

            df_list.append({
                "type": None,
                "title": titles[0].text,
                "link": str(list(titles[0].links)[0]).replace("./", "/"),
                "author_topic": author,
                "replies": int(str(thread.find(".posts")[0].text)
                               .replace("Replies", "")),
                "views": int(str(thread.find(".views")[0].text)
                             .replace("Views", "")),
                "subforum": subforum,
            })

    # Export data index
    subforum = subforum.replace(" ", "_")
    df = pd.DataFrame(df_list)
    df.to_csv(dst.replace(".csv", "") + "_" + subforum + ".csv")
def build_index(link, dst, nump):
    """Build the thread index of one subforum and export it as CSV.

    Args:
        link: Subforum link, appended to ``INCELS_THREAD_BASE`` followed by
            ``"page/<n>"``.
        dst: Base output path. A trailing ".csv" is stripped and
            ``"_<subforum>.csv"`` is appended (spaces become underscores).
        nump: Unused by this function.

    Replies and views are stored as the raw element text.
    """
    session = get_html_session()

    # Gets the first page
    r = session.get(INCELS_THREAD_BASE + link + "page/" + str(1))

    # Find number of pages (thousands separators are removed first)
    number_of_pages = int(
        r.html.find(".bbp-pagination a")[1].text.replace(",", ""))

    # Get name of subforum
    subforum = r.html.find(".bbp-breadcrumb-current")[0].text
    df_list = []

    # Get data index
    for page_num in range(1, number_of_pages + 1):
        print("Forum: {0} - Page {1}/{2}".format(subforum, page_num,
                                                 number_of_pages))
        r = session.get(INCELS_THREAD_BASE + link + "page/" + str(page_num))
        for thread in r.html.find(".status-publish"):
            # Fix: the old assignment ended with a stray comma, which turned
            # the author into a 1-tuple and exported it as "('name',)".
            author_nodes = thread.find('.bbp-author-name')
            author_topic = author_nodes[0].text if author_nodes else None

            permalink = thread.find('.bbp-topic-permalink')[0]
            df_list.append({
                "type": None,
                "title": permalink.text,
                "link": list(permalink.links)[0],
                "author_topic": author_topic,
                "replies": thread.find(' .bbp-topic-reply-count')[0].text,
                "views": thread.find(' .bbp-topic-voice-count')[0].text,
                "subforum": subforum,
            })

    # Export data index
    subforum = subforum.replace(" ", "_")
    df = pd.DataFrame(df_list)
    df.to_csv(dst.replace(".csv", "") + "_" + subforum + ".csv")
def get_thread(link, session=None):
    """Scrape every post of one thread and export them to a CSV file.

    Args:
        link: Thread link, forwarded to ``get_num_pages_post``,
            ``get_posts_page`` and ``get_post``.
        session: Optional session, forwarded to ``get_html_session``.

    The file name is ``link[9:]`` with every ``"/"`` removed.
    """
    session = get_html_session(session)
    total_pages = get_num_pages_post(link, session)
    rows = []

    # Process post pages
    for page_no in range(1, total_pages + 1):
        page_posts = get_posts_page(link, page_no, session)
        for idx, post in enumerate(page_posts):
            try:
                rows.append(get_post(post, link, session))
            except Exception:
                # A failing post is reported and skipped.
                traceback.print_exc()
                print("problem with post", idx, ":", link)

    # Export data post
    file_stem = re.sub("/", "", link[9:])
    frame = pd.DataFrame(rows)
    frame.to_csv("./data/forums/incels/posts/" + file_stem + ".csv",
                 index=False)
def build_topics_index(src, dst, nump):
    """Collect link and name of every subforum and export them as CSV.

    Args:
        src: Unused by this function.
        dst: Destination passed to ``DataFrame.to_csv``.
        nump: Unused by this function.
    """
    session = get_html_session()

    # Gets the first page
    response = session.get(INCELS_URL)
    rows = []

    # Processing data topics index
    for forum_row in response.html.find("#forums li ol .forumrow "):
        link_value = list(forum_row.find(".forumtitle a")[0].links)[0]
        name_value = forum_row.find('.forumtitle a')[0].text
        rows.append({"link": link_value, "subforum": name_value})

    # Export data topics index
    pd.DataFrame(rows).to_csv(dst)
def build_topics_index(src, dst, nump):
    """Collect link and name of every subforum and export them as CSV.

    Args:
        src: Unused by this function.
        dst: Destination passed to ``DataFrame.to_csv``.
        nump: Unused by this function.

    Only ``.topiclist dl`` entries containing a ``.feed-icon-forum`` element
    are exported; ``"./"`` in their link is replaced by ``"/"``.
    """
    session = get_html_session()

    # Gets the first page
    response = session.get(INCELS_URL)
    rows = []

    # Processing data topics index
    for entry in response.html.find(".topiclist dl"):
        if not entry.find(".feed-icon-forum"):
            continue
        raw_link = list(entry.find(".forumtitle")[0].links)[0]
        rows.append({
            "link": str(raw_link).replace("./", "/"),
            "subforum": entry.find(".forumtitle")[0].text,
        })

    # Export data topics index
    pd.DataFrame(rows).to_csv(dst)
dest="debug", action="store_true", help="Runs w/o multiprocessing for debugging.") args = parser.parse_args() os.makedirs(args.dst, exist_ok=True) # Build index topics if args.build_topics_index: build_topics_index(None, args.index_topics, args.nump) # Build index elif args.build_index: topics_list = list(pd.read_csv(args.index_topics)["link"].values) for link in topics_list: build_index(link, args.index, args.nump) # Get data posts else: to_run = list(pd.read_csv(args.index)["link"].values) to_run = list(zip([get_thread] * len(to_run), to_run)) if args.debug: get_html_session = get_html_session() for f, thread in to_run: f(thread, get_html_session) else: p = Pool(args.nump, initializer=initialize_worker) p.starmap(get_thread_global, to_run)
def get_posts_page(link, thread_page, session=None):
    """Fetch one page of a thread and return the post elements found on it.

    Args:
        link: Thread link, appended to ``INCELS_THREAD_BASE``.
        thread_page: Page number, appended after ``"page-"``.
        session: Optional session, forwarded to ``get_html_session``.

    Returns:
        The result of searching the page for ``.message--post``.
    """
    session = get_html_session(session)
    page_url = INCELS_THREAD_BASE + link + "page-" + str(thread_page)
    response = session.get(page_url)

    # Get element of post
    return response.html.find('.message--post')
def build_index(link, dst, nump):
    """Build the thread index of one subforum and export it as CSV.

    Args:
        link: Subforum link, appended to ``URL_THREAD_BASE``.
        dst: Base output path. A trailing ".csv" is stripped and
            ``"_<subforum>.csv"`` is appended, with "?" and "!" replaced by
            "-" and spaces by "_".
        nump: Unused by this function.

    Replies and views are stored as cleaned strings (label, commas and
    spaces removed), not as integers.
    """
    session = get_html_session()

    # Gets the first page
    r = session.get(URL_THREAD_BASE + link + "&page")

    # Find number of pages: parsed from the second "&"-separated chunk of
    # the stringified link set of the first ".first_last a" element.
    number_of_pages = 1
    last_page_anchors = r.html.find(".first_last a")
    if last_page_anchors:
        url_end_page = str(last_page_anchors[0].links)
        page_chunk = url_end_page.split("&")[1]
        number_of_pages = int(page_chunk.replace("page=", ""))

    # Get name of subforum
    subforum = r.html.find(".forumtitle")[0].text
    rows = []

    # Get data index
    for page_num in range(1, number_of_pages + 1):
        print("Forum: {0} - Page {1}/{2}".format(subforum, page_num,
                                                 number_of_pages))
        r = session.get(URL_THREAD_BASE + link + "&page=" + str(page_num))
        thread = list(r.html.find("#thread_inlinemod_form"))[0]
        thread_count = len(r.html.find("#thread_inlinemod_form .inner"))

        stats = thread.find('.threadstats li')
        titles = thread.find('.inner .title')
        authors = thread.find('.inner .author .username ')

        for i in range(thread_count):
            # Each thread owns 3 consecutive stats items: the first holds
            # the replies, the second the views.
            base = 3 * i

            replies_item = stats[base]
            if replies_item:
                replies = str(replies_item.text.replace("Replies:", ""))
                replies = replies.replace(",", "").replace(" ", "")
            else:
                replies = 0

            views_item = stats[base + 1]
            if views_item:
                views = str(views_item.text).replace("Views:", "")
                views = views.replace(",", "").replace(" ", "")
            else:
                views = 0

            title_item = titles[i]
            rows.append({
                "type": None,
                "title": str(title_item.text).replace("\n", ""),
                "link": str(list(title_item.links)[0]).replace("\n", ""),
                "author_topic": str(authors[i].text).replace("\n", ""),
                "replies": replies,
                "views": views,
                "subforum": subforum,
            })

    # Export data index
    subforum = subforum.replace("?", "-").replace("!", "-").replace(" ", "_")
    frame = pd.DataFrame(rows)
    frame.to_csv(dst.replace(".csv", "") + "_" + subforum + ".csv")
def get_posts_page(link, thread_page, session=None):
    """Fetch one page of a thread and return the post elements found on it.

    Args:
        link: Thread link, appended to ``URL_THREAD_BASE``.
        thread_page: Page number, appended after ``"&page="``.
        session: Optional session, forwarded to ``get_html_session``.

    Returns:
        The result of searching the page for ``.postcontainer``.
    """
    session = get_html_session(session)
    page_url = URL_THREAD_BASE + link + "&page=" + str(thread_page)
    response = session.get(page_url)

    # Get elements post
    return response.html.find('.postcontainer')