def __init__(self, _path, _conn=None, _rbc=None, _id=None, enc='utf-8'):
    """
    Reads a text file.
    :param _path: path to the text file
    :param _conn: connection to news.db
    :param _rbc: 1 if the file comes from the RBC source
    :param _id: record id
    :param enc: default encoding (currently the encodings in enc_arr are tried instead)
    """
    self.conn = _conn
    self.rbc = (_rbc == 1)
    self.id = _id
    self.path = _path
    self.header = ''
    self.txt = ''

    body_exp = re.compile(r'Body:(.*)')
    cat_exp = re.compile(r'^Category: (.+)')
    header_exp = re.compile(r'^Header: (.+)')

    # try encodings until the file reads without the chr(176) ('°') artifact,
    # which signals a wrong-encoding read
    enc_arr = ['cp1251', 'utf-8']
    lines = []
    for cur_enc in enc_arr:
        try:
            with open(_path, 'r', encoding=cur_enc) as file:
                lines = file.readlines()
            if not any(chr(176) in line for line in lines):
                break
        except (UnicodeDecodeError, OSError):
            continue

    if lines:
        if not _rbc:
            # structured files: Category/Header lines, then everything after 'Body:'
            flg_body = False
            for line in lines:
                cat_match = re.match(cat_exp, line)
                if cat_match:
                    self.source = cat_match.group(1)
                header_match = re.match(header_exp, line)
                if header_match:
                    self.header = header_match.group(1)
                if flg_body:
                    self.txt += line
                else:
                    body_match = re.match(body_exp, line)
                    if body_match:
                        flg_body = True
        else:
            # RBC files: the first non-empty cleaned line is the header,
            # the whole file is the body
            for line in lines:
                line = sf.clear_string(
                    line,
                    sf.rus_letters + sf.lat_letters + sf.digits + sf.puncts + ' ')
                if line:
                    self.header = line
                    break
            self.txt = ''.join(lines)
            self.source = 'rbc'
    self.size = len(self.txt)
def text_to_vector(self, l_obj=None):
    """
    Clears the text and fills the index dictionary (word -> occurrence count).
    :param l_obj: optional stemmer exposing get_stam(word)
    :return: None
    """
    self.vector = {}  # index dictionary: word -> occurrence count

    # clearing block: strip HTML entities and tags
    exp_arr = [r'(&[^;]+;)', r'(<[^>]+>)']
    txt = self.header + self.txt
    for pattern in exp_arr:
        txt = re.sub(re.compile(pattern), ' ', txt)
    txt = txt.replace('  ', ' ')  # collapse double spaces
    txt = txt.replace('\n', ' ')
    txt = self.remove_digits(txt)
    txt = sf.clear_string(txt, sf.rus_letters + sf.lat_letters + ' ' + '-' + r'\n')
    txt = txt.lower()

    txt_arr = self._text_to_arr(txt)
    txt_arr = self._remove_prepositions(txt_arr)

    # stem the words when a stemmer object is supplied
    if l_obj:
        stam_arr = [l_obj.get_stam(wrd) for wrd in txt_arr]
    else:
        stam_arr = txt_arr

    # count words (use the stemmed array, otherwise stemming has no effect)
    for word in stam_arr:
        self.vector[word] = self.vector.get(word, 0) + 1
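# Usage sketch for text_to_vector. Assumptions: the owning class is referred to
# as NewsText below (its real name is not shown in this snippet), and the file
# path is a placeholder; l_obj, if any, is a stemmer exposing get_stam(word).
def _demo_text_to_vector(path='data/news_001.txt'):
    doc = NewsText(path, _rbc=1)    # read a raw RBC-style text file
    doc.text_to_vector()            # build the word -> count dictionary
    # print the ten most frequent words
    top = sorted(doc.vector.items(), key=lambda kv: kv[1], reverse=True)[:10]
    for word, cnt in top:
        print('{}: {}'.format(word, cnt))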
def read_articles_data(self):
    self.load_new_links_from_db()
    for link in self.new_links:
        self.driver.get(link)
        try:
            title = sf.clear_string(
                self.driver.find_element_by_class_name('heading-big').text,
                sf.digits + sf.rus_letters + sf.lat_letters + sf.puncts + " ")
            cnt_views = int(sf.clear_string(
                self.driver.find_element_by_class_name('views-value').text,
                sf.digits))
            date_publ = self.driver.find_element_by_class_name('article-info-date').text
            text_len = len(self.driver.find_element_by_class_name('article__main-content').text)

            # one parameterized UPDATE instead of four string-formatted ones
            # (avoids broken queries when titles contain quotes)
            self.c.execute(
                "UPDATE links SET cntViews=?, title=?, datePubl=?, textLen=? WHERE link=?",
                (cnt_views, title, date_publ, text_len, link))
            print("{}: views {}, length {}".format(title, cnt_views, text_len))
            self.conn.commit()
        except Exception:
            pass
        time.sleep(1)
def GetFilmData(url):
    if 'https://' not in url:
        url = 'https://' + url
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    elements = soup.findAll('div', 'b-object-summary')
    if elements:
        name = genre = producer = country = ''
        year = 0

        # get name
        names_data = elements[0].findAll('div', 'b-object-header')
        if names_data:
            name = names_data[0].contents[1].contents[1]
            name = name.replace('\n', '').strip()
            name = sf.clear_string(
                name, sf.rus_letters + sf.lat_letters + sf.puncts + sf.digits)
        else:
            print("Error parsing name in {}".format(url))
            return 0

        additional_data = elements[0].findAll('div', 'm-margin-btm')
        if additional_data:
            # get genre
            genres = additional_data[0].findAll('div', 'b-tags')
            if genres:
                genre = genres[0].text.replace('\n', '')
                genre = sf.clear_string(
                    genre, sf.rus_letters + sf.lat_letters + sf.puncts)

            # get country (all comma-separated items except the last two)
            countries = additional_data[0].findAll('span', 'creation')
            country_arr = []
            if countries:
                country_arr = countries[0].text.split(',')
                for c_id in range(len(country_arr) - 2):
                    country += country_arr[c_id] + ','
                country = country[:-1]
                country = sf.clear_string(
                    country, sf.rus_letters + sf.lat_letters + sf.puncts)

            # get year: scan from the end for a plausible four-digit year
            if len(country_arr) > 1:
                for i in range(len(country_arr) - 1, -1, -1):
                    try:
                        year = int(country_arr[i])
                        if year in range(1900, 2020):
                            break
                    except ValueError:
                        pass

            # get producer: the element following the 'Режиссер' (director) label
            flg_found = False
            try:
                for cont in additional_data[0].contents:
                    if flg_found:
                        producer = sf.clear_string(
                            cont.text, sf.rus_letters + sf.lat_letters + sf.puncts)
                        break
                    if 'Режиссер' in cont:
                        flg_found = True
            except Exception:
                print("Error parsing producer in {}".format(url))
        else:
            print("Error parsing additional data in {}".format(url))
            return 0
        return FilmItem(name, year, genre, producer, country)
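# Usage sketch for GetFilmData. Assumption: FilmItem exposes its constructor
# arguments (name, year, genre, producer, country) as attributes with the same
# names; the URL is a placeholder for a real film page.
def _demo_get_film_data(url='www.example-film-site.ru/film/12345'):
    film = GetFilmData(url)
    if film:  # GetFilmData returns 0 (or None) when parsing fails
        print('{} ({}) - {} / {}'.format(film.name, film.year, film.genre, film.country))
    else:
        print('Could not parse film page: {}'.format(url))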
def GetFilmUsers(film_link, request_headers):
    print('Film {}: collecting users'.format(film_link))
    film_id_arr = film_link.split('/')[4].split('-')
    film_id = film_id_arr[-1]
    i = 0
    users = []
    with requests.Session() as session:
        request_headers['Referer'] = film_link
        session.headers = request_headers

        # get the pagination token
        response = session.get(film_link)
        soup = BeautifulSoup(response.content, 'html.parser')

        # skip the item unless it is a non-series title that has a Russian
        # version and was released in 2000 or later
        parent_div = soup.findAll('div', 'parent')
        if parent_div:
            year_str = parent_div[0].findAll('span', 'nobr')
            if '–' in year_str[0].text:  # a year range means a series
                return
            year = sf.clear_string(year_str[0].text, sf.digits)
            if int(year) < 2000:
                return
            is_russian = False
            for cont in parent_div[0].contents:
                if hasattr(cont, 'contents') and hasattr(cont, 'text'):
                    for letter in sf.rus_letters:
                        if letter in cont.text:
                            is_russian = True
                            break
                if is_russian:
                    break
            if not is_russian:
                return

        # users from the first page of reviews
        elements = soup.findAll('div', 'lister-item')
        if elements:
            for el in elements:
                user_link_row = el.findAll('span', 'display-name-link')
                if user_link_row:
                    user_link = user_link_row[0].contents[0].attrs['href']
                    user_id = user_link[8:len(user_link) - 13]
                    users.append(user_id)

        pattern = re.compile(r'data-key="(.*)\sd')
        token_search = pattern.search(response.text)
        if token_search:
            # a next page was found
            token = token_search.group(1)
            token = token[:len(token) - 1]
            token_link = film_link.split('?')[0]
            prev_review_cnt = 0
            while 1:
                # adding new reviews
                time.sleep(1)
                i += 1
                print('-{}'.format(i))
                if token:
                    response = session.get(
                        '{}/_ajax?ref_=undefined&paginationKey={}'.format(
                            token_link, token))
                else:
                    response = session.get(film_link)
                soup = BeautifulSoup(response.content, 'html.parser')
                elements = soup.findAll('div', 'lister-item')
                if elements:
                    for el in elements:
                        user_link_row = el.findAll('span', 'display-name-link')
                        if user_link_row:
                            user_link = user_link_row[0].contents[0].attrs['href']
                            user_id = user_link[8:len(user_link) - 13]
                            users.append(user_id)
                pattern = re.compile('data-key="(.*)"')
                try:
                    token = pattern.search(response.text).group(1)
                except AttributeError:
                    break
    return users
def GetUserData(user_page, request_headers):
    print('Getting ratings from user ' + user_page)
    film_link_mask = 'http://www.imdb.com/title/{}/reviews?ref_=tt_urv'
    user_link_prefix = 'http://www.imdb.com'
    result = []
    film_link = ''
    rating = -1
    with requests.Session() as session:
        session.headers = request_headers
        i = 0
        while 1:
            i += 1
            time.sleep(1)
            try:
                page = session.get(user_page)
            except requests.RequestException:
                break
            soup = BeautifulSoup(page.content, "html.parser")
            elements = soup.findAll('div', 'lister-item-content')  # table with marks
            for el in elements:
                is_series = False
                el_data = el.findAll('h3', 'lister-item-header')
                if el_data:
                    # get the film link
                    for cont in el_data[0].contents:
                        if hasattr(cont, 'attrs'):
                            if cont.attrs.get('href'):
                                film_id = cont.attrs.get('href').split('/')[2]
                                film_link = film_link_mask.format(film_id)
                            # check whether the item is a series
                            if cont.attrs.get('class') and 'lister-item-year' in cont.attrs.get('class'):
                                if '–' in cont.text:  # a year range means a series
                                    is_series = True
                                    break
                                else:
                                    year = sf.clear_string(cont.text, sf.digits)
                                    if not year or int(year) < 2000:
                                        is_series = True
                                        break
                if not is_series:
                    # skip series ratings and films released before 2000;
                    # get the mark
                    rating_widget = el.findAll('div', 'ipl-rating-widget')
                    if rating_widget:
                        for cont in rating_widget[0].contents:
                            if hasattr(cont, 'attrs'):
                                if 'ipl-rating-star--other-user' in cont.attrs.get('class', []):
                                    rating = cont.text.replace('\n', '')
                        result.append(ReviewItem(film_link, rating, ''))
            # look for the next page
            paginator = soup.findAll('div', 'list-pagination')
            user_page = ''
            if paginator:
                for cont in paginator[0].contents:
                    if hasattr(cont, 'attrs'):
                        user_page = cont.attrs.get('href')
            if not user_page:
                break
            user_page = user_link_prefix + user_page
            print('-{}'.format(i))
    return result
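# Usage sketch combining GetFilmUsers and GetUserData. Assumptions: a plain
# desktop User-Agent header is enough for these requests, and the user ratings
# page is rebuilt as /user/ur<id>/ratings because GetFilmUsers strips the 'ur'
# prefix from the profile href; both the header and the URL pattern are
# illustrative, not confirmed by the original code.
def _demo_collect_ratings(film_link):
    headers = {'User-Agent': 'Mozilla/5.0'}
    ratings = []
    users = GetFilmUsers(film_link, headers) or []  # None when the film is skipped
    for user_id in users:
        user_page = 'http://www.imdb.com/user/ur{}/ratings'.format(user_id)
        ratings.extend(GetUserData(user_page, headers))
    return ratings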
if len(el.contents) > 10:
    # get movie data
    country = ""
    producer = ""
    genre = ""
    link = el.contents[1].attrs['href'][2:]
    marksNum = int(el.contents[3].contents[1].text)
    midMarkStr = el.contents[3].contents[3].contents[0].contents[3].text
    midMarkArr = midMarkStr.split(":")
    midMark = midMarkArr[1].strip()
    midMarkStr = midMark[0:3]
    if midMarkStr.find(",") != -1:
        midMark = float(midMarkStr.replace(",", "."))
    else:
        midMarkStr = sf.clear_string(midMarkStr, sf.digits)
        midMark = float(midMarkStr)
    filmName = sf.clear_string(el.contents[5].contents[1].text,
                               legitimate_symbols)
    contInd = 7
    if len(el.contents[contInd].contents) < 3:
        contInd += 2
    if hasattr(el.contents[contInd], "text"):
        if el.contents[contInd].text != "":
            # genre = el.contents[contInd].contents[1].text[0:len(el.contents[contInd].contents[1].text)-9]
            genre = el.contents[contInd].contents[1].text
            contInd += 2
    if len(el.contents) > contInd:
        if el.contents[contInd].text.find("Режиссер") == -1:
            contInd += 2
    if len(el.contents) > contInd:
while 1:  # go through all the film's reviews
    txt = driver.execute_script("return document.body.innerHTML")
    soup = BeautifulSoup(''.join(txt), "html.parser")
    elements = soup.find_all("div", review_page_name)
    for el in elements:
        # get review data
        user_id = 0
        mark_str = el.contents[1].contents[1].contents[1].contents[3].text
        if mark_str.find(":") != -1:
            mark_str_arr = mark_str.split(":")
            mark_str = mark_str_arr[1].strip()
            mark_str = mark_str[0:2].strip()
        else:
            mark_str = "0"
        user_name = el.contents[3].contents[3].contents[1].text
        user_name = sf.clear_string(user_name, legitimate_symbols)
        user_link = el.contents[3].contents[3].contents[1].contents[0].attrs['href'][2:]

        # save the user if it is not in the database yet (parameterized
        # queries avoid quoting problems in names and links)
        c_user.execute("SELECT id FROM users WHERE link=?", (user_link,))
        for user in c_user.fetchall():
            user_id = user[0]
        if not user_id:
            c_user.execute("INSERT INTO users (name, link) VALUES (?, ?)",
                           (user_name, user_link))
            conn.commit()
            c_user.execute("SELECT id FROM users WHERE link=?", (user_link,))
            user_id = c_user.fetchone()[0]
def read_link(link):
    result = {}
    r = requests.get(link)
    soup = BeautifulSoup(r.text, 'html.parser')

    # brand name (try the media card layout first, then the small card layouts)
    name_el = soup.find_all("h1", "mediaCardHeader__cardHeaderName")
    if not name_el:  # small card
        name_el = soup.find_all("h1", "cardHeader__headerNameText")
    if not name_el:
        name_el = soup.find_all("div", "card__headerWrapper")
    if name_el:
        val = name_el[0].text.split(',')[0]
        result['brand'] = sf.clear_string(
            val, sf.rus_letters + sf.lat_letters + sf.digits + sf.puncts + ' ')

    comp_type = soup.find_all("div", "cardHeader__headerDescriptionText")
    if comp_type:
        result["compType"] = comp_type[0].text

    # address
    addr_el = soup.find_all("div", "mediaCardHeader__cardAddressName")
    if not addr_el:
        addr_el = soup.find_all("a", "card__addressLink")
    if addr_el:
        result['addr'] = sf.clear_string(
            addr_el[0].text,
            sf.rus_letters + sf.lat_letters + sf.digits + sf.puncts + ' ' + '/')
    dop_addr_el = soup.find_all("div", "mediaAddress__drilldown")
    if not dop_addr_el:
        dop_addr_el = soup.find_all("div", "_purpose_drilldown")
    if dop_addr_el:
        result['addr_dop'] = sf.clear_string(
            dop_addr_el[0].text,
            sf.rus_letters + sf.lat_letters + sf.digits + sf.puncts + ' ' + '/')
    """
    addr_arr = result['addr'].split(',')
    if len(addr_arr) > 1:
        result['cityName'] = addr_arr[1]
    else:
        result['cityName'] = result['addr']
    """

    # phone
    tel_el = soup.find_all("a", "contact__phonesItemLink")
    if not tel_el:
        tel_el = soup.find_all("a", "mediaContacts__phonesNumber")
    if tel_el:
        result['tel'] = tel_el[0].text

    # rubrics (2GIS categories)
    rubric_el = soup.find_all("div", "cardRubrics__rubrics")
    if not rubric_el:
        rubric_el = soup.find_all("div", "mediaAttributes__rubrics")
    if rubric_el:
        txt = ""
        for el in rubric_el[0].contents:
            txt += el.text + "|"
        txt = txt[:len(txt) - 1]
        result['gisCatStr'] = txt
        result['gisCatMain'] = txt.split("|")[0]

    # website
    website_el = soup.find_all("div", "card__legal")
    if not website_el:
        website_el = soup.find_all("a", "mediaContacts__website")
    if not website_el:
        website_el = soup.find_all("a", "contact__linkText")
    if website_el:
        if hasattr(website_el[0], 'attrs'):
            result['website'] = website_el[0].get('title')
        else:
            result['website'] = website_el[0].text

    # description attributes
    description_el = soup.find_all("li", "cardAttributes__attrsListItem")
    if not description_el:
        description_el = soup.find_all("ul", "mediaAttributes__groupList")
    descr_field = ''
    if description_el:
        for el in description_el[0].contents:
            descr_field += el.text + ';'
        descr_field = descr_field[0:-1]
    result['descr'] = descr_field

    # get stars (hotel class)
    stars_pos = result['descr'].find('звезд')
    if stars_pos != -1:
        result['stars'] = result['descr'][stars_pos - 2:stars_pos - 1]

    # get restaurant flag
    rest_pos = result['descr'].find('естор')
    if rest_pos != -1:
        result['hasRest'] = 1

    # get price (average bill); bounds check prevents an IndexError at the
    # end of the description string
    bill_offset = 5
    bill_pos = result['descr'].find(' чек ')
    if bill_pos == -1:
        bill_pos = result['descr'].find(' чек')
        bill_offset = 4
    if bill_pos == -1:
        bill_pos = result['descr'].find(' от ')
        bill_offset = 4
    if bill_pos != -1:
        bill = ''
        while (bill_pos + bill_offset < len(result['descr'])
               and sf.is_digit(result['descr'][bill_pos + bill_offset])):
            bill += result['descr'][bill_pos + bill_offset]
            bill_pos += 1
        result['bill'] = bill

    return result
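# Usage sketch for read_link. Assumption: the link points at a 2GIS company
# card page; the URL below is a placeholder, and only a few of the returned
# keys are printed.
def _demo_read_card(link='https://2gis.ru/firm/placeholder'):
    card = read_link(link)
    for key in ('brand', 'addr', 'tel', 'gisCatMain', 'website', 'bill'):
        if key in card:
            print('{}: {}'.format(key, card[key]))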
def seek_industries_search_bar(dbPath, driverPath):
    # seeks companies in all cities and fills the db table

    # load search categories
    categories = {}
    conn = lite.connect(dbPath)
    c = conn.cursor()
    c.execute("SELECT id, name FROM searches WHERE isChecked=0")
    for obj_row in c.fetchall():
        categories[obj_row[0]] = obj_row[1]

    cities_to_look = [
        'Москва', 'Санкт-Петербург', 'Новосибирск', 'Екатеринбург',
        'Нижний Новгород', 'Казань', 'Челябинск', 'Омск', 'Самара',
        'Ростов-на-Дону', 'Уфа', 'Красноярск', 'Пермь', 'Воронеж', 'Волгоград'
    ]
    cr_city = ""
    visited_links = []
    # the link used to enter a new city
    start_link = "https://2gis.ru/countries/global/moscow?queryState=center%2F27.224121%2C55.751849%2Fzoom%2F5"
    flg_reload_categories = False

    while 1:  # go through cities
        driver = webdriver.Chrome(driverPath)
        driver.get(start_link)
        driver.maximize_window()
        if not flg_reload_categories:
            driver, cr_city = get_next_city(driver, cr_city)
            if cr_city == "":
                break
        time.sleep(2)

        while 1:  # go through categories
            if cr_city not in cities_to_look:
                # if cr_city != 'Екатеринбург':
                driver.close()
                flg_reload_categories = False
                break
            cat_name = get_next_category(categories, cr_city, conn)
            if not cat_name:
                driver.close()
                flg_reload_categories = False
                break  # go to the next city

            seek_form = driver.find_elements_by_class_name("suggest__input")
            if len(seek_form) > 0:
                seek_form[0].clear()
                seek_form[0].send_keys(cat_name)
            btn = driver.find_elements_by_class_name("searchBar__submit")
            if len(btn) > 0:
                btn[0].click()
            time.sleep(2)

            while 1:  # go through companies in the current category
                cards = driver.find_elements_by_class_name("miniCard__content")
                if cards:
                    for card in cards:
                        # collect data for each company on the page
                        # driver.execute_script("window.scrollBy(0," + str(card.location['y'] - 200) + ")")
                        driver.execute_script(
                            "window.scrollTo(0, document.body.scrollHeight);")
                        try:
                            card.click()
                            data_dict = {}
                            data_dict["catName"] = [cat_name, True]
                            data_dict["cityName"] = [cr_city, True]
                            time.sleep(3)
                            # read and save data
                            # txt = driver.execute_script("return document.body.innerHTML")
                            # soup = BeautifulSoup(''.join(txt), 'html.parser')
                            # brand_name = sf.clear_string(driver.find_element_by_class_name("cardHeader__headerNameText").text, sf.rus_letters+sf.lat_letters+sf.digits+sf.puncts+" ")
                            brand_name = sf.clear_string(
                                card.find_element_by_class_name(
                                    'miniCard__headerTitleLink').text,
                                sf.rus_letters + sf.lat_letters + sf.digits +
                                sf.puncts + " ")
                            data_dict["brandName"] = [brand_name, True]
                        except Exception:
                            pass
                        try:
                            addr = driver.find_element_by_class_name(
                                "card__addressLink").text
                            data_dict["addr"] = [addr, True]
                        except Exception:
                            pass
                        try:
                            web_site = driver.find_element_by_class_name("link").text
                            data_dict["webSite"] = [web_site, True]
                        except Exception:
                            pass
                        try:
                            filials_num = sf.clear_string(
                                driver.find_element_by_class_name(
                                    "card__filialsLink").text, sf.digits)
                            data_dict["filialsNum"] = [filials_num, True]
                        except Exception:
                            pass

                        cmd = sf.build_insert_expression(data_dict, "output_categories")
                        sf.execute_query(conn, cmd, 3)

                        # close the opened card and return to the results list
                        try:
                            css_arr = [
                                "a.link.frame__controlsButton._back._undashed",
                                "a.link.frame__controlsButton._close._undashed"
                            ]
                            for css in css_arr:
                                if click_closing_button(driver, css):
                                    break
                            else:
                                print('Did not find closing button for: ' +
                                      driver.current_url)
                        except Exception:
                            pass
                else:
                    # got a single card instead of a results list
                    data_dict, back_link = read_single_card(driver)
                    if len(data_dict) == 0:
                        print("unable to read data: city = {}, category = {}, link = {}"
                              .format(cr_city, cat_name, driver.current_url))
                    else:
                        data_dict["catName"] = [cat_name, True]
                        data_dict["cityName"] = [cr_city, True]
                        cmd = sf.build_insert_expression(data_dict, "output_categories")
                        sf.execute_query(conn, cmd, 3)
                    if (back_link.location_once_scrolled_into_view['x'] > 0
                            and back_link.location_once_scrolled_into_view['y'] > 0):
                        try:
                            back_link.click()
                            time.sleep(2)
                            flg_reload_categories = True
                        except Exception:
                            pass
                    break

                # load the next page of the results list
                try:
                    next_link_disabled = driver.find_element_by_css_selector(
                        "div.pagination__arrow._right._disabled")
                    click_element(
                        driver,
                        "a.link.searchBar__mediaButton.searchBar__mediaClose._undashed",
                        False)
                    flg_reload_categories = True
                    break  # a disabled next arrow found - the end of the list
                except Exception:
                    next_link = driver.find_element_by_css_selector(
                        "div.pagination__arrow._right")
                    next_link.click()
                    time.sleep(2)
                finally:
                    flg_reload_categories = True
                    break

            sf.execute_query(
                conn,
                "INSERT INTO checkedData (obj, city) VALUES ('{}', '{}')".format(
                    cat_name, cr_city))

    driver.close()