def get_thread_link_from_android_forum(pagination_url_list, selected_dates):
    thread_link_list = []
    thread_name_list = []
    product_name_list = []
    dic_thread_name = defaultdict(list)
    # Matches dates such as "Oct 5, 2018" or "October 5, 2018".
    date_pattern = re.compile(
        r"(Jan(uary)?|Feb(ruary)?|Mar(ch)?|Apr(il)?|May|Jun(e)?|Jul(y)?|Aug(ust)?|"
        r"Sep(tember)?|Oct(ober)?|Nov(ember)?|Dec(ember)?)\s+\d{1,2},\s+\d{4}")
    for url in pagination_url_list:
        soup = parse(url)
        product_name = soup.find("div", class_="channel_title").get_text()
        for thread in soup.find_all("div", class_="listBlock main"):
            for child_thread in thread.find_all("div", class_="secondRow"):
                date_node = child_thread.find("abbr", class_="uix_DateTime")
                strip_date = date_node.attrs['data-datestring'].strip()
                date = date_pattern.search(strip_date).group()
                converted_date = datetime.strptime(
                    date, '%b %d, %Y').strftime('%m/%d/%Y')
                cdate = datetime.strptime(converted_date, '%m/%d/%Y').date()
                sdate = datetime.strptime(selected_dates[-1], '%m/%d/%Y').date()
                s1date = datetime.strptime(selected_dates[0], '%m/%d/%Y').date()
                if cdate > sdate:
                    # Thread is newer than the last selected date; stop scanning this page.
                    break
                elif cdate < s1date:
                    # Thread is older than the first selected date; ignore it.
                    pass
                else:
                    for date in selected_dates:
                        if date == converted_date:
                            if thread.find("a", class_="PreviewTooltip"):
                                link = thread.find("a", class_="PreviewTooltip")
                                thread_link_list.append(link.attrs['href'])
                                thread_name_list.append(link.get_text())
                                product_name_list.append(product_name)
                            else:
                                for anchor in thread.find_all("a", class_=""):
                                    attrs_dict = anchor.attrs
                                    if attrs_dict:
                                        thread_link_list.append(attrs_dict['href'])
                                        thread_name_list.append(anchor.get_text())
                                        product_name_list.append(product_name)
    print("thread_name_list", thread_name_list)
    # Map each thread link to its [thread name, product name] pair.
    for i, key in enumerate(thread_link_list):
        dic_thread_name[key].append(thread_name_list[i])
        dic_thread_name[key].append(product_name_list[i])
    return dic_thread_name

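# The date-window check above (stop once a post is newer than the last selected date,
# skip anything older than the first, keep only exact matches) recurs in the issue
# scrapers below. A minimal standalone sketch of that logic; the helper name is
# hypothetical and the function is illustration only, not used by the original module
# (datetime is already imported at module level).
def _date_window_action(converted_date, selected_dates):
    """Return 'stop', 'skip' or 'keep' for a post dated converted_date (MM/DD/YYYY)."""
    cdate = datetime.strptime(converted_date, '%m/%d/%Y').date()
    newest = datetime.strptime(selected_dates[-1], '%m/%d/%Y').date()
    oldest = datetime.strptime(selected_dates[0], '%m/%d/%Y').date()
    if cdate > newest:
        return 'stop'   # nothing past the newest selected date is scanned
    if cdate < oldest:
        return 'skip'   # older than the selected window
    return 'keep' if converted_date in selected_dates else 'skip'
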
def get_models_names_from_gsmarena(request, soup, url):
    list_page = pagination_for_mobile_brand_list_from_gsmarena(soup, url)
    mobile_model_name_list = []
    mobile_model_links_list = []
    mobile_model_year_list = []
    main_url = []
    mobile_brand = request.session.get('brand')
    for l in list_page:
        url = GSMARENA_URL + l
        soup = parse(url)
        for mobile_model_container in soup.find_all("div", class_="makers"):
            for item in mobile_model_container.find_all("li"):
                mobile_model_year = item.find("img")
                year_string = mobile_model_year.attrs["title"]
                # Pull out the sentence that mentions the announcement, e.g. "Announced Feb 2019".
                split_year = re.search(r"\.?([^\.]*Announced[^\.]*)", year_string)
                mobile_model_name = item.find("span")
                model_links = item.find("a")
                main_url.append(GSMARENA_URL)
                if split_year is not None:
                    # The announcement year is the last word of the matched sentence.
                    year = re.search(r"(\w+)$", split_year.group(1))
                    mobile_model_year_list.append(year.group(0))
                else:
                    mobile_model_year_list.append("Other")
                mobile_model_name_list.append(mobile_model_name.text)
                mobile_model_links_list.append(model_links.attrs['href'])
    model_dictionary = {MAIN_URL_KEY: main_url,
                        BRAND_NAME_KEY: mobile_brand,
                        ANNOUNCED_YEAR_DICT_KEY: mobile_model_year_list,
                        MODEL_NAME_DICT_KEY: mobile_model_name_list,
                        MODEL_LINK_DICT_KEY: mobile_model_links_list}
    dic_year = defaultdict(list)
    dic_model_name = defaultdict(list)
    # Group model names by announcement year and model links by model name.
    for year, name, link in zip(mobile_model_year_list,
                                mobile_model_name_list,
                                mobile_model_links_list):
        dic_year[year].append(name)
        dic_model_name[name].append(link)
    Write_to_DB(model_dictionary, MODEL_NAME_DATABASE_TABLE)
    return dic_year, dic_model_name

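# The two lookups returned above are plain groupings: announcement year -> model names,
# and model name -> model links. A self-contained sketch with made-up example data
# (the names, years, and links below are illustrative, not taken from the source):
def _group_models_example():
    from collections import defaultdict
    years = ["2019", "2019", "Other"]
    names = ["Model A", "Model B", "Model C"]
    links = ["model_a.php", "model_b.php", "model_c.php"]
    dic_year = defaultdict(list)
    dic_model_name = defaultdict(list)
    for year, name, link in zip(years, names, links):
        dic_year[year].append(name)        # e.g. {"2019": ["Model A", "Model B"], ...}
        dic_model_name[name].append(link)  # e.g. {"Model A": ["model_a.php"], ...}
    return dic_year, dic_model_name
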
def pagination_for_thread_links(model_url):
    page_url = model_url + "/page/%s"
    pagination_list = []
    soup = parse(model_url)
    if soup.find("ul", {"class": "lia-paging-full-pages"}):
        number_of_pages = soup.find("ul", {"class": "lia-paging-full-pages"})
        page_text = number_of_pages.text
        page_number_list = re.findall(r'\d+', page_text)
        list_last_page_number = page_number_list[-1]
        for i in range(1, int(list_last_page_number) + 1):
            urls = page_url % i  # make a url list and iterate over it
            pagination_list.append(urls)
    return pagination_list

def pagination_for_user_comment_links(model_url):
    pagination_list = []
    soup = parse(model_url)
    for node in soup.find_all("div", class_="PageNav"):
        child_node = node.find("span", class_="pageNavHeader")
        page_header_text = child_node.text.split(" ")
        # The last token of the pager header is the total page count.
        page_number = page_header_text[-1]
        for num in range(1, int(page_number) + 1):
            url = model_url + "page-" + str(num)
            pagination_list.append(url)
    return pagination_list

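# Both pagination helpers above follow the same pattern: read the highest page number
# out of the pager text, then expand it into one URL per page. A standalone sketch of
# that expansion; the function name and the "page-" suffix are illustrative only
# (re is already imported at module level).
def _expand_page_urls(base_url, pager_text):
    """Build one URL per page from pager text such as '1 2 3 ... 12'."""
    last_page = int(re.findall(r'\d+', pager_text)[-1])
    return [base_url + "page-" + str(num) for num in range(1, last_page + 1)]
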
def get_models_names(request, url):
    # Dispatch to the scraper that matches the forum named in the URL; each scraper
    # returns a (dic_year, dic_model_name) pair.
    model_name_dic = ()
    soup = parse(url)
    if GSMARRENS_STRING in url:
        model_name_dic = get_models_names_from_gsmarena(request, soup, url)
    elif ANDROID_FORUM_STRING in url:
        model_name_dic = get_models_names_from_android_forum(request, soup)
    elif ANDROID_PIT_FORUM_STRING in url:
        model_name_dic = get_models_names_from_android_pit_forum(request, soup, url)
    elif GADGETS_FORUM_STRING in url:
        model_name_dic = get_models_names_from_gadgets360(request, soup)
    elif SONY_FORUM_STRING in url:
        model_name_dic = get_models_names_from_sonyforum(request, soup)
    return model_name_dic

def get_brand_names(request):
    """Build the brand-listing URL for the selected forum and collect brand names and links.

    :param request: current request; the forum's main URL is read from the session
    :return: (mobile_brand_list, mobile_brand_links_list)
    """
    url = request.session.get('mainurl')
    if ANDROID_FORUM_STRING in url:
        url = url + "devices/list/"
    elif ANDROID_PIT_FORUM_STRING in url:
        url = url + "forum/"
    elif GADGETS_FORUM_STRING in url:
        url = url + "mobiles/all-brands"
    elif SONY_FORUM_STRING in url:
        url = url + "/t5/Phones-Tablets/ct-p/Phones"
    print("url", url)
    soup = parse(url)
    mobile_brand_list = []
    mobile_brand_links_list = []
    if GSMARRENS_STRING in url:
        get_brand_name_from_gsmarena(soup, mobile_brand_list,
                                     mobile_brand_links_list)
    elif ANDROID_FORUM_STRING in url:
        get_brand_name_from_androidforum(soup, mobile_brand_list,
                                         mobile_brand_links_list)
    elif ANDROID_PIT_FORUM_STRING in url:
        get_brand_name_from_androidpit_forum(soup, mobile_brand_list,
                                             mobile_brand_links_list)
    elif GADGETS_FORUM_STRING in url:
        get_brand_name_from_gadget360(soup, mobile_brand_list,
                                      mobile_brand_links_list)
    elif SONY_FORUM_STRING in url:
        get_brand_name_forum_sonyforum(soup, mobile_brand_list,
                                       mobile_brand_links_list)
    return mobile_brand_list, mobile_brand_links_list

def sony_forum_get_issue(request, selected_model_links, selected_dates):
    date_list = []
    url_list = []
    product_list = []
    category_list = []
    user_comment_list = []
    heading_name_list = []
    for model_url in selected_model_links:
        pagination_url_list = []
        pagination_list = pagination_for_thread_links(model_url)
        if pagination_list:
            for url in pagination_list:
                pagination_url_list.append(url)
        else:
            pagination_url_list.append(model_url)
        data_fetch_url_list = remove_dupilcate_link(
            get_thread_link_from_sony_forum(pagination_url_list, selected_dates))
        for url in data_fetch_url_list:
            soup = parse(url)
            for issue_container in soup.find_all(
                    "div", class_="lia-linear-display-message-view"):
                local_date = issue_container.find('span', class_='local-friendly-date')
                issue_local_date = issue_container.find('span', class_='local-date')
                issue_date = ""
                # Prefer the friendly date's title attribute; fall back to the plain date span.
                if local_date:
                    if local_date.has_attr('title'):
                        issue_date = local_date['title'][1:11]
                elif issue_local_date:
                    issue_date = issue_local_date.get_text()
                match = re.search(r'\d{4}-\d{2}-\d{2}', issue_date.strip('\u200e'))
                product_date = datetime.strptime(match.group(), '%Y-%m-%d').date()
                converted_date = product_date.strftime('%m/%d/%Y')
                cdate = datetime.strptime(converted_date, '%m/%d/%Y').date()
                sdate = datetime.strptime(selected_dates[-1], '%m/%d/%Y').date()
                s1date = datetime.strptime(selected_dates[0], '%m/%d/%Y').date()
                child_node = issue_container.find('div', class_='lia-message-body-content')
                if cdate > sdate:
                    # Post is newer than the last selected date; stop scanning this thread page.
                    break
                elif cdate < s1date:
                    # Post is older than the first selected date; skip it.
                    pass
                else:
                    for date in selected_dates:
                        if date == converted_date:
                            date_list.append(issue_date.strip('\u200e'))
                            url_list.append(url)
                            product_name = soup.find(
                                "a",
                                class_="lia-link-navigation crumb-board "
                                       "lia-breadcrumb-board lia-breadcrumb-forum"
                            ).get_text()
                            product_list.append(product_name)
                            thread = soup.find(
                                "span",
                                class_="lia-link-navigation lia-link-disabled").get_text()
                            heading_name_list.append(thread)
                            issue_data, category = generic_category_filter(child_node)
                            user_comment_list.append(issue_data)
                            category_list.append(category)
    data_dictionary = {
        "Product": product_list,
        "Date": date_list,
        "Category": category_list,
        "Thread": heading_name_list,
        "Link": url_list,
        "Comment": user_comment_list
    }
    if not product_list:
        data_dictionary = {}
    else:
        file_writer = fileReaderWriter()
        file_writer.write_data_using_pandas(request, data_dictionary)
    return data_dictionary

def get_thread_link_from_sony_forum(pagination_url_list, selected_dates):
    issue_links_list = []
    for url in pagination_url_list:
        soup = parse(url)
        for product_container in soup.find_all(
                "div", {"class": "lia-component-messages-column-message-info"}):
            product_cont = product_container.find(
                "a", {"class": "page-link lia-link-navigation lia-custom-event"})
            product_links = product_cont.attrs["href"]
            issue_url = SONY_FORUM_URL + product_links
            for check_date in product_container.find_all(
                    'span', {"class": "local-friendly-date"}):
                issue_dates = check_date['title'][1:11]
                product_date = datetime.strptime(issue_dates, '%Y-%m-%d').date()
                converted_date = product_date.strftime('%m/%d/%Y')
                cdate = datetime.strptime(converted_date, '%m/%d/%Y').date()
                sdate = datetime.strptime(selected_dates[-1], '%m/%d/%Y').date()
                s1date = datetime.strptime(selected_dates[0], '%m/%d/%Y').date()
                if cdate > sdate:
                    # Thread is newer than the last selected date; stop scanning this listing.
                    break
                elif cdate < s1date:
                    # Thread is older than the first selected date; skip it.
                    pass
                else:
                    for date in selected_dates:
                        if date == converted_date.strip('\u200e'):
                            # Pagination: if the thread spans more than one page,
                            # collect a link for every page.
                            if product_container.find("ul", class_="lia-list-standard-inline"):
                                issue_soup = parse(issue_url)
                                page_url = issue_url + "/page/%s"
                                issue_link = issue_soup.find(
                                    "div", {"class": "lia-quilt-row lia-quilt-row-main"})
                                page_link = issue_link.find(
                                    "div",
                                    {"class": "lia-paging-full-wrapper lia-paging-pager "
                                              "lia-paging-full-left-position "
                                              "lia-discussion-page-message-pager "
                                              "lia-component-message-pager"})
                                if page_link:
                                    last_pages = page_link.find(
                                        "ul", {"class": "lia-paging-full-pages"})
                                    # Get the last number in the pager text: the page count.
                                    page_text = last_pages.text
                                    number_list = re.findall(r'\d+', page_text)
                                    list_number = number_list[-1]
                                    for i in range(1, int(list_number) + 1):
                                        urls = page_url % i  # make a url list and iterate over it
                                        issue_links_list.append(urls)
                            else:
                                issue_links_list.append(issue_url)
    return issue_links_list

def android_forum_get_issue(request, selected_model_links, selected_dates):
    date_list = []
    url_list = []
    product_list = []
    category_list = []
    user_comment_list = []
    heading_name_list = []
    # Matches dates such as "Oct 5, 2018" or "October 5, 2018".
    date_pattern = re.compile(
        r"(Jan(uary)?|Feb(ruary)?|Mar(ch)?|Apr(il)?|May|Jun(e)?|Jul(y)?|Aug(ust)?|"
        r"Sep(tember)?|Oct(ober)?|Nov(ember)?|Dec(ember)?)\s+\d{1,2},\s+\d{4}")
    for model_url in selected_model_links:
        pagination_url_list = []
        pagination_list = pagination_for_user_comment_links(model_url)
        if pagination_list:
            for url in pagination_list:
                pagination_url_list.append(url)
        else:
            pagination_url_list.append(model_url)
        dic_thread_name = get_thread_link_from_android_forum(
            pagination_url_list, selected_dates)
        for thread_url, thread_name in dic_thread_name.items():
            complete_url = ANDROID_FORUM_URL + thread_url
            soup = parse(complete_url)
            for node in soup.find_all("div", class_="messageInfo primaryContent"):
                child_node_date = node.find("a", class_="datePermalink")
                strip_date = child_node_date.text.strip()
                date = date_pattern.search(strip_date).group()
                converted_date = datetime.strptime(
                    date, '%b %d, %Y').strftime('%m/%d/%Y')
                cdate = datetime.strptime(converted_date, '%m/%d/%Y').date()
                sdate = datetime.strptime(selected_dates[-1], '%m/%d/%Y').date()
                s1date = datetime.strptime(selected_dates[0], '%m/%d/%Y').date()
                child_node = node.find("div", class_="messageContent")
                if cdate > sdate:
                    # Comment is newer than the last selected date; stop scanning this page.
                    break
                elif cdate < s1date:
                    # Comment is older than the first selected date; skip it.
                    pass
                else:
                    for date in selected_dates:
                        if date == converted_date:
                            date_list.append(converted_date.strip('\u200e'))
                            heading_name_list.append(thread_name[0])
                            product_list.append(thread_name[1])
                            url_list.append(complete_url)
                            issue_data, category = generic_category_filter(child_node)
                            user_comment_list.append(issue_data)
                            category_list.append(category)
    data_dictionary = {
        "Product": product_list,
        "Date": date_list,
        "Category": category_list,
        "Thread": heading_name_list,
        "Link": url_list,
        "Comment": user_comment_list
    }
    if not product_list:
        data_dictionary = {}
    else:
        file_writer = fileReaderWriter()
        file_writer.write_data_using_pandas(request, data_dictionary)
    return data_dictionary

def get_models_names_from_android_pit_forum(request, soup, url):
    mobile_model_name_list = []
    mobile_model_links_list = []
    mobile_model_year_list = []
    dic_model_name = defaultdict(list)
    dic_year = defaultdict(list)
    sub_cat_link = []
    main_url = []
    mobile_brand = request.session.get('brand')
    if soup.find("a", class_="forumSubcategory"):
        for subCatLink in soup.find_all("a", class_="forumSubcategory"):
            sub_cat_link.append(ANDROID_PIT_FORUM_URL + subCatLink.attrs["href"])
    if not sub_cat_link:
        # No subcategories: the page itself is the only model forum.
        soup = parse(url)
        product = soup.find("div", class_="forumHeadingWithFavorite")
        product_name = product.find("h1")
        mobile_model_name_list.append(product_name.text)
        mobile_model_links_list.append(url)
        mobile_model_year_list.append("All")
    else:
        for forumLink in sub_cat_link:
            soup = parse(forumLink)
            product = soup.find("div", class_="forumHeadingWithFavorite")
            product_name_head = product.find("h1")
            if soup.find("a", class_="forumSubcategory"):
                # The subcategory has its own subcategories; collect each of them.
                for subCatLink in soup.find_all("a", class_="forumSubcategory"):
                    product_name = subCatLink.find("h3")
                    if "/" in product_name.text:
                        main_url.append(ANDROID_PIT_FORUM_URL)
                        mobile_model_name_list.append(product_name_head.text)
                        mobile_model_links_list.append(subCatLink.attrs["href"])
                        mobile_model_year_list.append("All")
                    else:
                        main_url.append(ANDROID_PIT_FORUM_URL)
                        mobile_model_name_list.append(product_name.text)
                        mobile_model_links_list.append(
                            ANDROID_PIT_FORUM_URL + subCatLink.attrs["href"])
                        mobile_model_year_list.append("All")
            else:
                soup = parse(forumLink)
                product = soup.find("div", class_="forumHeadingWithFavorite")
                product_name = product.find("h1")
                main_url.append(ANDROID_PIT_FORUM_URL)
                mobile_model_name_list.append(product_name.text)
                mobile_model_links_list.append(forumLink)
                mobile_model_year_list.append("All")
    model_dictionary = {MAIN_URL_KEY: main_url,
                        BRAND_NAME_KEY: mobile_brand,
                        ANNOUNCED_YEAR_DICT_KEY: mobile_model_year_list,
                        MODEL_NAME_DICT_KEY: mobile_model_name_list,
                        MODEL_LINK_DICT_KEY: mobile_model_links_list}
    Write_to_DB(model_dictionary, MODEL_NAME_DATABASE_TABLE)
    # Group model names by year label and model links by model name.
    for i, key in enumerate(mobile_model_year_list):
        dic_year[key].append(mobile_model_name_list[i])
    for j, mobile_name_key in enumerate(mobile_model_name_list):
        dic_model_name[mobile_name_key].append(mobile_model_links_list[j])
    return dic_year, dic_model_name