Code example #1
File: news_parser.py  Project: DeadLekar/news
    def __init__(self, _path, _conn=None, _rbc=None, _id=None, enc='utf-8'):
        """
        reads text file
        :param path: path to text file
        :param c: coursor to news.db
        """
        self.conn = _conn
        self.rbc = (_rbc == 1)
        self.id = _id
        self.path = _path
        self.header = ''
        self.txt = ''
        body_exp = re.compile(r'Body:(.*)')
        cat_exp = re.compile(r'^Category: (.+)')
        header_exp = re.compile(r'^Header: (.+)')

        # Try encodings in turn; chr(176) ('°') is typical mojibake when
        # UTF-8 text is decoded as cp1251, so treat it as a wrong guess.
        enc_arr = ['cp1251', 'utf-8']
        lines = []
        for enc in enc_arr:
            try:
                with open(_path, 'r', encoding=enc) as file:
                    lines = file.readlines()
                if not any(chr(176) in line for line in lines):
                    break
            except UnicodeDecodeError:
                continue

        if lines:
            if not _rbc:
                flg_body = False
                for line in lines:
                    cat_match = re.match(cat_exp, line)
                    if cat_match:
                        self.source = cat_match.group(1)

                    header_match = re.match(header_exp, line)
                    if header_match:
                        self.header = header_match.group(1)

                    if flg_body:
                        self.txt += line
                    else:
                        body_match = re.match(body_exp, line)
                        if body_match:
                            flg_body = True
            else:
                for line in lines:
                    line = sf.clear_string(line, sf.rus_letters+sf.lat_letters+sf.digits+sf.puncts+' ')
                    if line:
                        self.header = line
                        break
                self.txt = ''.join(lines)
                self.source = 'rbc'
            self.size = len(self.txt)
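
A minimal usage sketch for the constructor above (not part of the project): it assumes the enclosing class is called NewsItem, which is not visible in this excerpt, and uses placeholder file paths.

# Hypothetical class and file names - shown only to illustrate the _rbc switch.
plain_item = NewsItem('data/lenta_001.txt')       # marker-based file: Category:/Header:/Body:
rbc_item = NewsItem('data/rbc_001.txt', _rbc=1)   # RBC file: first non-empty line becomes the header
print(plain_item.header, plain_item.size)
print(rbc_item.source)  # 'rbc'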
Code example #2
File: news_parser.py  Project: DeadLekar/news
    def text_to_vector(self, l_obj=None):
        """
        clears text and fill index dictionary with word-count% format
        :return: None
        """
        self.vector = {}  # index dictionary with word-count% format
        # clearing block
        # strip HTML entities and tags, then normalise whitespace
        exp_arr = [r'(&[^;]+;)', r'(<[^>]+>)']
        txt = self.header + self.txt
        for pattern in exp_arr:
            txt = re.sub(pattern, ' ', txt)
        txt = txt.replace('  ', ' ')
        txt = txt.replace('\n', ' ')
        txt = self.remove_digits(txt)
        txt = sf.clear_string(txt, sf.rus_letters + sf.lat_letters + ' ' + '-' + r'\n')
        txt = txt.lower()

        txt_arr = self._text_to_arr(txt)
        txt_arr = self._remove_prepositions(txt_arr)

        # optionally reduce words to stems via the supplied lemmatiser object
        if l_obj:
            stam_arr = [l_obj.get_stam(wrd) for wrd in txt_arr]
        else:
            stam_arr = txt_arr

        # count words (over the stemmed list, so the stemming above is actually used)
        for word in stam_arr:
            self.vector[word] = self.vector.get(word, 0) + 1
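
The vector built above is a plain dict mapping word to count. A minimal sketch, assuming only that structure, of how two such vectors could be compared; the cosine_similarity helper below is illustrative and not part of the project.

import math

def cosine_similarity(vec_a, vec_b):
    # vec_a, vec_b: dicts of word -> count, as produced by text_to_vector()
    dot = sum(cnt * vec_b.get(word, 0) for word, cnt in vec_a.items())
    norm_a = math.sqrt(sum(cnt * cnt for cnt in vec_a.values()))
    norm_b = math.sqrt(sum(cnt * cnt for cnt in vec_b.values()))
    if not norm_a or not norm_b:
        return 0.0
    return dot / (norm_a * norm_b)

# e.g. score = cosine_similarity(item_a.vector, item_b.vector)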
Code example #3
File: webSites.py  Project: DeadLekar/web_loader
    def read_articles_data(self):
        self.load_new_links_from_db()
        for link in self.new_links:
            self.driver.get(link)
            try:
                title = sf.clear_string(
                    self.driver.find_element_by_class_name('heading-big').text,
                    sf.digits + sf.rus_letters + sf.lat_letters + sf.puncts + " ")
                cnt_views = int(sf.clear_string(
                    self.driver.find_element_by_class_name('views-value').text,
                    sf.digits))
                date_publ = self.driver.find_element_by_class_name('article-info-date').text
                text_len = len(self.driver.find_element_by_class_name('article__main-content').text)
                # one parameterised UPDATE instead of four formatted ones:
                # avoids SQL injection and broken queries when titles contain quotes
                self.c.execute(
                    "UPDATE links SET cntViews=?, title=?, datePubl=?, textLen=? WHERE link=?",
                    (cnt_views, title, date_publ, text_len, link))
                print("{}: views {}, length {}".format(title, cnt_views, text_len))
                self.conn.commit()
            except Exception:
                pass  # skip pages that lack the expected elements
            time.sleep(1)
Code example #4
def GetFilmData(url):
    if 'https://' not in url: url = 'https://' + url
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    elements = soup.findAll('div', 'b-object-summary')
    if elements:
        # get name
        name = genre = producer = country = ''
        year = 0
        names_data = elements[0].findAll('div', 'b-object-header')
        if names_data:
            name = names_data[0].contents[1].contents[1]
            name = name.replace('\n', '').strip()
            name = sf.clear_string(
                name, sf.rus_letters + sf.lat_letters + sf.puncts + sf.digits)
        else:
            print("Error parsling name in {}".format(url))
            return 0

        additional_data = elements[0].findAll('div', 'm-margin-btm')
        if additional_data:
            # get genre
            genres = additional_data[0].findAll('div', 'b-tags')
            if genres:
                genre = genres[0].text.replace('\n', '')
                genre = sf.clear_string(
                    genre, sf.rus_letters + sf.lat_letters + sf.puncts)

            # get country: all comma-separated parts except the last two
            countries = additional_data[0].findAll('span', 'creation')
            country_arr = []
            if countries:
                country_arr = countries[0].text.split(',')
                country = ','.join(country_arr[:-2])
                country = sf.clear_string(
                    country, sf.rus_letters + sf.lat_letters + sf.puncts)

            # get year: scan the comma-separated parts from the end
            if len(country_arr) > 1:
                for i in range(len(country_arr) - 1, -1, -1):
                    try:
                        year = int(country_arr[i])
                        if year in range(1900, 2020):
                            break
                    except ValueError:
                        pass

            # get producer
            flg_found = False
            try:
                for cont in additional_data[0].contents:
                    if flg_found:
                        producer = sf.clear_string(
                            cont.text,
                            sf.rus_letters + sf.lat_letters + sf.puncts)
                        break
                    if 'Режиссер' in cont:
                        flg_found = True
            except:
                print("Error parsling producer in {}".format(url))

        else:
            print("Error parsling additional data in {}".format(url))
            return 0

        return FilmItem(name, year, genre, producer, country)
Code example #5
def GetFilmUsers(film_link, request_headers):
    print('Film {}: collecting users'.format(film_link))
    film_id_arr = film_link.split('/')[4].split('-')
    film_id = film_id_arr[-1]
    i = 0
    users = []

    with requests.Session() as session:
        request_headers['Referer'] = film_link
        session.headers = request_headers
        # get token
        response = session.get(film_link)
        soup = BeautifulSoup(response.content, 'html.parser')

        # check that the item is not a series, has a Russian version, and was released in 2000 or later
        parent_div = soup.findAll('div', 'parent')
        if parent_div:
            year_str = parent_div[0].findAll('span', 'nobr')
            if '–' in year_str[0].text:
                return
            year = sf.clear_string(year_str[0].text, sf.digits)
            if int(year) < 2000:
                return
            is_russian = False
            for cont in parent_div[0].contents:
                if hasattr(cont, 'contents') and hasattr(cont, 'text'):
                    for letter in sf.rus_letters:
                        if letter in cont.text:
                            is_russian = True
                            break
                    if is_russian: break
            if not is_russian:
                return

        elements = soup.findAll('div', 'lister-item')
        if elements:
            for el in elements:
                user_link_row = el.findAll('span', 'display-name-link')
                if user_link_row:
                    user_link = user_link_row[0].contents[0].attrs['href']
                    user_id = user_link[8:len(user_link) - 13]
                    users.append(user_id)

        pattern = re.compile(r'data-key="(.*)\sd')
        token_search = pattern.search(response.text)
        if token_search:  # a next-page token was found
            token = token_search.group(1)
            token = token[:len(token) - 1]
            token_link = film_link.split('?')[0]
            prev_review_cnt = 0
            while 1:  # adding new reviews
                time.sleep(1)
                i += 1
                print('-{}'.format(i))
                if token:
                    response = session.get(
                        '{}/_ajax?ref_=undefined&paginationKey={}'.format(
                            token_link, token))
                else:
                    response = session.get(film_link)
                soup = BeautifulSoup(response.content, 'html.parser')
                elements = soup.findAll('div', 'lister-item')
                if elements:
                    for el in elements:
                        user_link_row = el.findAll('span', 'display-name-link')
                        if user_link_row:
                            user_link = user_link_row[0].contents[0].attrs[
                                'href']
                            user_id = user_link[8:len(user_link) - 13]
                            users.append(user_id)
                pattern = re.compile(r'data-key="(.*)"')
                token_search = pattern.search(response.text)
                if not token_search:
                    break  # no further pagination token
                token = token_search.group(1)
    return users
Code example #6
def GetUserData(user_page, request_headers):
    print('Getting ratings from user ' + user_page)
    film_link_mask = 'http://www.imdb.com/title/{}/reviews?ref_=tt_urv'
    user_link_prefix = 'http://www.imdb.com'
    result = []
    film_link = ''
    rating = -1
    with requests.Session() as session:
        session.headers = request_headers
        i = 0
        while 1:
            i += 1
            time.sleep(1)
            try:
                page = session.get(user_page)
            except requests.RequestException:
                break
            soup = BeautifulSoup(page.content, "html.parser")
            elements = soup.findAll(
                'div', 'lister-item-content')  # get table with marks
            for el in elements:
                is_series = False
                el_data = el.findAll('h3', 'lister-item-header')
                if el_data:
                    # get film link
                    for cont in el_data[0].contents:
                        if hasattr(cont, 'attrs'):
                            if cont.attrs.get('href'):
                                film_id = cont.attrs.get('href').split('/')[2]
                                film_link = film_link_mask.format(film_id)
                            # check if item is series
                            if cont.attrs.get(
                                    'class'
                            ) and 'lister-item-year' in cont.attrs.get(
                                    'class'):
                                if '–' in cont.text:
                                    is_series = True
                                    break
                                else:
                                    year = sf.clear_string(
                                        cont.text, sf.digits)
                                    if not year or int(year) < 2000:
                                        is_series = True
                                        break
                if not is_series:  # skip series and films released before 2000
                    # get mark
                    rating_widget = el.findAll('div', 'ipl-rating-widget')
                    if rating_widget:
                        for cont in rating_widget[0].contents:
                            if hasattr(cont, 'attrs'):
                                if 'ipl-rating-star--other-user' in cont.attrs.get('class', []):
                                    rating = cont.text.replace('\n', '')
                        result.append(ReviewItem(film_link, rating, ''))
            # look for next page
            paginator = soup.findAll('div', 'list-pagination')
            user_page = ''
            if paginator:
                for cont in paginator[0].contents:
                    if hasattr(cont, 'attrs'):
                        user_page = cont.attrs.get('href')
            if not user_page: break
            user_page = user_link_prefix + user_page
            print('-{}'.format(i))
    return result
Code example #7
if len(el.contents) > 10:
    # get movie data
    country = ""
    producer = ""
    genre = ""
    link = el.contents[1].attrs['href'][2:]
    marksNum = int(el.contents[3].contents[1].text)
    midMarkStr = el.contents[3].contents[3].contents[0].contents[3].text
    midMarkArr = midMarkStr.split(":")
    midMark = midMarkArr[1].strip()
    midMarkStr = midMark[0:3]
    if "," in midMarkStr:
        midMark = float(midMarkStr.replace(",", "."))
    else:
        midMarkStr = sf.clear_string(midMarkStr, sf.digits)
        midMark = float(midMarkStr)
    filmName = sf.clear_string(el.contents[5].contents[1].text,
                               legitimate_symbols)
    contInd = 7
    if len(el.contents[contInd].contents) < 3:
        contInd += 2
    if hasattr(el.contents[contInd], "text"):
        if el.contents[contInd].text != "":
            # genre = el.contents[contInd].contents[1].text[0:len(el.contents[contInd].contents[1].text)-9]
            genre = el.contents[contInd].contents[1].text
    contInd += 2
    if len(el.contents) > contInd:
        if "Режиссер" not in el.contents[contInd].text:
            contInd += 2
    if len(el.contents) > contInd:
Code example #8
File: get_users.py  Project: DeadLekar/kinoman
    while 1:  #go through all the film's reviews
        txt = driver.execute_script("return document.body.innerHTML")
        soup = BeautifulSoup(''.join(txt), "html.parser")
        elements = soup.find_all("div", review_page_name)
        for el in elements:
            #get review data
            user_id = 0
            mark_str = el.contents[1].contents[1].contents[1].contents[3].text
            if ":" in mark_str:
                mark_str_arr = mark_str.split(":")
                mark_str = mark_str_arr[1].strip()
                mark_str = mark_str[0:2].strip()
            else:
                mark_str = "0"
            user_name = el.contents[3].contents[3].contents[1].text
            user_name = sf.clear_string(user_name, legitimate_symbols)
            user_link = el.contents[3].contents[3].contents[1].contents[
                0].attrs['href'][2:]

            # save new user (parameterised queries avoid breaking on quotes in names/links)
            c_user.execute("SELECT id FROM users WHERE link=?", (user_link,))
            for user in c_user.fetchall():
                user_id = user[0]
            if not user_id:
                c_user.execute("INSERT INTO users (name, link) VALUES (?, ?)",
                               (user_name, user_link))
                conn.commit()
                c_user.execute("SELECT id FROM users WHERE link=?", (user_link,))
                user_id = c_user.fetchone()[0]
Code example #9
def read_link(link):
    result = {}
    r = requests.get(link)
    soup = BeautifulSoup(r.text, 'html.parser')

    name_el = soup.find_all("h1", "mediaCardHeader__cardHeaderName")
    if not name_el:  #small card
        name_el = soup.find_all("h1", "cardHeader__headerNameText")
        if not name_el:
            name_el = soup.find_all("div", "card__headerWrapper")
    if name_el:
        val = name_el[0].text.split(',')[0]
        result['brand'] = sf.clear_string(
            val, sf.rus_letters + sf.lat_letters + sf.digits + sf.puncts + ' ')

    comp_type = soup.find_all("div", "cardHeader__headerDescriptionText")
    if comp_type:
        result["compType"] = comp_type[0].text

    addr_el = soup.find_all("div", "mediaCardHeader__cardAddressName")
    if not addr_el:
        addr_el = soup.find_all("a", "card__addressLink")

    if addr_el:
        result['addr'] = sf.clear_string(
            addr_el[0].text, sf.rus_letters + sf.lat_letters + sf.digits +
            sf.puncts + ' ' + '/')
    dop_addr_el = soup.find_all("div", "mediaAddress__drilldown")
    if not dop_addr_el:
        dop_addr_el = soup.find_all("div", "_purpose_drilldown")
    if dop_addr_el:
        result['addr_dop'] = sf.clear_string(
            dop_addr_el[0].text, sf.rus_letters + sf.lat_letters + sf.digits +
            sf.puncts + ' ' + '/')
        """
        addr_arr = result['addr'].split(',')
        if len(addr_arr) > 1:
            result['cityName'] = addr_arr[1]
        else:
            result['cityName'] = result['addr']
        """

    tel_el = soup.find_all("a", "contact__phonesItemLink")
    if not tel_el:
        tel_el = soup.find_all("a", "mediaContacts__phonesNumber")
    if tel_el:
        result['tel'] = tel_el[0].text

    rubric_el = soup.find_all("div", "cardRubrics__rubrics")
    if not rubric_el:
        rubric_el = soup.find_all("div", "mediaAttributes__rubrics")
    if rubric_el:
        txt = ""
        for el in rubric_el[0].contents:
            txt += el.text + "|"
        txt = txt[:len(txt) - 1]
        result['gisCatStr'] = txt
        result['gisCatMain'] = txt.split("|")[0]

    website_el = soup.find_all("div", "card__legal")
    if not website_el:
        website_el = soup.find_all("a", "mediaContacts__website")
    if not website_el:
        website_el = soup.find_all("a", "contact__linkText")
        if website_el:
            if hasattr(website_el[0], 'attrs'):
                result['website'] = website_el[0].get('title')
    else:
        result['website'] = website_el[0].text

    description_el = soup.find_all("li", "cardAttributes__attrsListItem")
    if not description_el:
        description_el = soup.find_all("ul", "mediaAttributes__groupList")
    descr_field = ''
    if description_el:
        descr_field = ';'.join(el.text for el in description_el[0].contents)
        result['descr'] = descr_field

        # get stars
        stars_pos = result['descr'].find('звезд')
        if stars_pos != -1:
            result['stars'] = result['descr'][stars_pos - 2:stars_pos - 1]

        # get restaurant
        rest_pos = result['descr'].find('естор')
        if rest_pos != -1:
            result['hasRest'] = 1

        # get price
        bill_offset = 5
        bill_pos = result['descr'].find(' чек ')
        if bill_pos == -1:
            bill_pos = result['descr'].find(' чек')
            bill_offset = 4
        if bill_pos == -1:
            bill_pos = result['descr'].find(' от ')
            bill_offset = 4

        if bill_pos != -1:
            bill = ''
            pos = bill_pos + bill_offset
            # stop at the end of the string to avoid an IndexError on trailing digits
            while pos < len(result['descr']) and sf.is_digit(result['descr'][pos]):
                bill += result['descr'][pos]
                pos += 1
            result['bill'] = bill

    return result
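
A minimal usage sketch for read_link (the URL below is a placeholder, not a real 2GIS card address): it prints whichever fields the parser managed to extract.

card = read_link('https://2gis.ru/firm/0000000000')  # placeholder card URL
for key in ('brand', 'compType', 'addr', 'tel', 'gisCatMain', 'bill'):
    if key in card:
        print('{}: {}'.format(key, card[key]))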
Code example #10
def seek_industries_search_bar(dbPath, driverPath):
    # searches for companies in all cities and fills the db table
    # load the categories to search for

    categories = {}
    conn = lite.connect(dbPath)
    c = conn.cursor()
    c.execute("SELECT id, name FROM searches WHERE isChecked=0")
    for obj_row in c.fetchall():
        categories[obj_row[0]] = obj_row[1]

    cities_to_look = [
        'Москва', 'Санкт-Петербург', 'Новосибирск', 'Екатеринбург',
        'Нижний Новгород', 'Казань', 'Челябинск', 'Омск', 'Самара',
        'Ростов-на-Дону', 'Уфа', 'Красноярск', 'Пермь', 'Воронеж', 'Волгоград'
    ]
    cr_city = ""
    visited_links = []
    start_link = "https://2gis.ru/countries/global/moscow?queryState=center%2F27.224121%2C55.751849%2Fzoom%2F5"  # the link to enter a new city
    flg_reload_categories = False
    while 1:  # go through cities
        driver = webdriver.Chrome(driverPath)
        driver.get(start_link)
        driver.maximize_window()
        if not flg_reload_categories:
            driver, cr_city = get_next_city(driver, cr_city)
        if cr_city == "": break
        time.sleep(2)
        while 1:  # go through categories
            if cr_city not in cities_to_look:
                #if cr_city != 'Екатеринбург':
                driver.close()
                flg_reload_categories = False
                break

            cat_name = get_next_category(categories, cr_city, conn)
            if not cat_name:
                driver.close()
                flg_reload_categories = False
                break  # go to the next city
            seek_form = driver.find_elements_by_class_name("suggest__input")
            if len(seek_form) > 0:
                seek_form[0].clear()
                seek_form[0].send_keys(cat_name)
                btn = driver.find_elements_by_class_name("searchBar__submit")
                if len(btn) > 0:
                    btn[0].click()
                    time.sleep(2)
                    while 1:  # go through companies in current category
                        cards = driver.find_elements_by_class_name(
                            "miniCard__content")
                        if cards:
                            for card in cards:  # collect data for each company on the page
                                # driver.execute_script("window.scrollBy(0," + str(card.location['y'] - 200) + ")")
                                driver.execute_script(
                                    "window.scrollTo(0, document.body.scrollHeight);"
                                )
                                # build data_dict before the click so a failed click
                                # cannot leave data from the previous card in place
                                data_dict = {}
                                data_dict["catName"] = [cat_name, True]
                                data_dict["cityName"] = [cr_city, True]
                                try:
                                    card.click()
                                    time.sleep(3)
                                    # read and save data
                                    # txt = driver.execute_script("return document.body.innerHTML")
                                    # soup = BeautifulSoup(''.join(txt), 'html.parser')
                                    # brand_name = sf.clear_string(driver.find_element_by_class_name("cardHeader__headerNameText").text, sf.rus_letters+sf.lat_letters+sf.digits+sf.puncts+" ")
                                    brand_name = sf.clear_string(
                                        card.find_element_by_class_name(
                                            'miniCard__headerTitleLink').text,
                                        sf.rus_letters + sf.lat_letters +
                                        sf.digits + sf.puncts + " ")
                                    data_dict["brandName"] = [brand_name, True]
                                except:
                                    pass
                                try:
                                    addr = driver.find_element_by_class_name(
                                        "card__addressLink").text
                                    data_dict["addr"] = [addr, True]
                                except:
                                    pass
                                try:
                                    web_site = driver.find_element_by_class_name(
                                        "link").text
                                    data_dict["webSite"] = [web_site, True]
                                except:
                                    pass
                                try:
                                    filials_num = sf.clear_string(
                                        driver.find_element_by_class_name(
                                            "card__filialsLink").text,
                                        sf.digits)
                                    data_dict["filialsNum"] = [
                                        filials_num, True
                                    ]
                                except:
                                    pass

                                cmd = sf.build_insert_expression(
                                    data_dict, "output_categories")
                                sf.execute_query(conn, cmd, 3)

                                try:
                                    css_arr = [
                                        "a.link.frame__controlsButton._back._undashed",
                                        "a.link.frame__controlsButton._close._undashed"
                                    ]
                                    for css in css_arr:
                                        if click_closing_button(driver, css):
                                            break
                                    else:
                                        print(
                                            'Did not find closing button for:'
                                            + driver.current_url)
                                except:
                                    pass

                        else:  # got a single card
                            data_dict, back_link = read_single_card(driver)
                            if len(data_dict) == 0:
                                print(
                                    "unable to read data: city = {}, category = {}, link = {}"
                                    .format(cr_city, cat_name,
                                            driver.current_url))
                            else:
                                data_dict["catName"] = [cat_name, True]
                                data_dict["cityName"] = [cr_city, True]
                                cmd = sf.build_insert_expression(
                                    data_dict, "output_categories")
                                sf.execute_query(conn, cmd, 3)

                            scrolled = back_link.location_once_scrolled_into_view
                            if scrolled['x'] > 0 and scrolled['y'] > 0:
                                try:
                                    back_link.click()
                                    time.sleep(2)
                                    flg_reload_categories = True
                                except:
                                    pass
                                break

                        # load next page
                        try:
                            driver.find_element_by_css_selector(
                                "div.pagination__arrow._right._disabled")
                            # a disabled next arrow found - the end of the list
                            click_element(
                                driver,
                                "a.link.searchBar__mediaButton.searchBar__mediaClose._undashed",
                                False)
                            flg_reload_categories = True
                            break
                        except Exception:
                            # no disabled arrow yet - go to the next page of results
                            try:
                                next_link = driver.find_element_by_css_selector(
                                    "div.pagination__arrow._right")
                                next_link.click()
                                time.sleep(2)
                            except Exception:
                                flg_reload_categories = True
                                break
                    sf.execute_query(
                        conn,
                        "INSERT INTO checkedData (obj, city) VALUES ('{}', '{}')"
                        .format(cat_name, cr_city))
    driver.close()