Example 1
class ChromeTestCase(unittest.TestCase):
    def setUp(self):
        self.s = Session(
            'chromedriver',
            browser='chrome',
            default_timeout=15,
            webdriver_options={'arguments': ['headless', 'disable-gpu']})

    def test_cookie_transfer_to_requests(self):
        """Tested on http://testing-ground.scraping.pro/login"""

        self.s.driver.get('http://testing-ground.scraping.pro/login')
        self.s.driver.find_element_by_id('usr').send_keys('admin')
        self.s.driver.ensure_element_by_id('pwd').send_keys(
            '12345', Keys.ENTER)
        self.s.driver.ensure_element_by_xpath(
            '//div[@id="case_login"]/h3[@class="success"]')

        self.s.transfer_driver_cookies_to_session()
        response = self.s.get(
            'http://testing-ground.scraping.pro/login?mode=welcome')
        success_message = response.xpath(
            '//div[@id="case_login"]/h3[@class="success"]/text()'
        ).extract_first()

        self.assertEqual(
            success_message, 'WELCOME :)',
            'Failed to transfer cookies from Selenium to Requests')

    def test_cookie_transfer_to_selenium(self):
        self.s.get('http://testing-ground.scraping.pro/login')
        self.s.cookies.set('tdsess',
                           'TEST_DRIVE_SESSION',
                           domain='testing-ground.scraping.pro')

        self.s.transfer_session_cookies_to_driver()
        self.s.driver.get(
            'http://testing-ground.scraping.pro/login?mode=welcome')
        success_message = self.s.driver.xpath(
            '//div[@id="case_login"]/h3[@class="success"]/text()'
        ).extract_first()

        self.assertEqual(
            success_message, 'WELCOME :)',
            'Failed to transfer cookies from Requests to Selenium')

    def tearDown(self):
        self.s.driver.close()
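The test case above omits its imports; assuming the usual ones for a requestium test module, a minimal runner sketch is:

import unittest

from requestium import Session, Keys

if __name__ == '__main__':
    unittest.main()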
Example 2
def main():
    try:
        start = sys.argv[1]
    except IndexError:
        print('ERROR: Requires URL as the first argument.')
        sys.exit(1)

    # Constants
    ALLDROPDOWN = '//*[@id="selectReadType"]/option[2]'
    ACTUALIMAGES = '//*[@id="divImage"]//img'
    IMGGROUPS = '.listing a'
    TITLE = '.bigChar'
    NEXT = '//*[(@id = "btnNext")]//src'

    s = Session(
        webdriver_path='C:\\Webdrivers\\chromedriver', browser='chrome'
    )  # ,webdriver_options={'arguments': ['headless', 'disable-gpu']}

    s.driver.get(start)
    s.driver.ensure_element_by_css_selector(TITLE)
    title = s.driver.find_element_by_css_selector(TITLE).text
    groups = s.driver.find_elements_by_css_selector(IMGGROUPS)
    s.transfer_driver_cookies_to_session()
    begin = to_attribute_list(groups, 'href').pop()
    response = s.get(begin).xpath(ACTUALIMAGES)
    print(response)
    s.close()
    quit(2)
Example 4
def main(url):
    session = Session(
        webdriver_path='../Chrome Canary/chromedriver.exe',
        browser='chrome',
        default_timeout=6,
        webdriver_options={'arguments': ['disable-logging', 'headless']})

    session.driver.get(url)
    div_content = WebDriverWait(session.driver, 5).until(
        EC.presence_of_element_located((By.XPATH, "//div[@id='content']")))
    print('######## FROM SELENIUM ########')
    print(div_content.text)

    print('######## COPYING SESSION FROM SELENIUM TO REQUESTS ########')
    session.transfer_driver_cookies_to_session()
    final_response = session.get(url,
                                 headers={'user-agent': 'custom requestium'})

    soup = BeautifulSoup(final_response.text, 'html.parser')
    print('######## FROM REQUESTS ########')
    body_text = soup.find(id="content")
    print(body_text.text)
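main() above relies on imports the snippet omits; a plausible set, plus a hypothetical invocation, is:

from bs4 import BeautifulSoup
from requestium import Session
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

if __name__ == '__main__':
    main('https://example.com')  # illustrative URL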
Example 5
class RugratsBot:
    def __init__(self, userLogin: str, userPass: str) -> None:
        self._rugratSession = Session("./chromedriver",
                                      browser="chrome",
                                      default_timeout=15)
        self._userLogin = userLogin
        self._userPassword = userPass
        self._isLogged = False

        # default/recommended delay range (seconds) between comments and follows
        self._rangeTimeBetComments = 290
        self._rangeTimeBetFollow = 400

    def setLoginInfo(self, userLogin: str, userPass: str) -> None:
        self._userLogin = userLogin
        self._userPassword = userPass

    def setInstagramPageUrl(self, instaPageUrl: str) -> None:
        self._instagramPageUrl = instaPageUrl

    def setListOfComments(self, listOfComments: List) -> None:
        self._listOfComments = listOfComments

    def setTimeBetComments(self, timeBetweenComments: int) -> None:
        self._rangeTimeBetComments = timeBetweenComments

    def isInternetOn(self) -> bool:
        url = "https://duckduckgo.com/"
        timeout = 5
        try:
            _ = self._rugratSession.get(url, timeout=timeout)
            return True
        except ConnectionError:
            print("No connection available")
        return False

    def login(self, saveLoginInformation: bool = True) -> None:
        if self._userLogin == "" or self._userPassword == "":
            return

        # Sign in on instagram **outset**
        self._rugratSession.driver.get(
            "https://www.instagram.com/accounts/login/?hl=pt-br")
        sleep(5)

        self._rugratSession.driver.ensure_element_by_css_selector(
            "input[name='username']").send_keys(self._userLogin)
        self._rugratSession.driver.ensure_element_by_css_selector(
            "input[name='password']").send_keys(self._userPassword)

        sleep(5)
        self._rugratSession.driver.ensure_element_by_xpath(
            "/html/body/div[1]/section/main/div/article/div/div[1]/div/form/div/div[3]/button/div"
        ).click()
        # Sign in on instagram **end**

        if saveLoginInformation:
            # Save login information on Chromium driver **outset**
            sleep(5)
            self._rugratSession.driver.ensure_element_by_xpath(
                "/html/body/div[1]/section/main/div/div/div/section/div/button"
            ).click()

            sleep(5)
            self._rugratSession.driver.ensure_element_by_xpath(
                "/html/body/div[4]/div/div/div/div[3]/button[1]").click()
            # Save login information on Chromium driver **end**

        self._isLogged = True

    def logout(self) -> None:
        if self._isLogged:
            self._rugratSession.driver.get("https://www.instagram.com")
            self._rugratSession.driver.ensure_element_by_xpath(
                "/html/body/div[1]/section/nav/div[2]/div/div/div[3]/div/div[5]/span/img"
            ).click()
            self._rugratSession.driver.ensure_element_by_xpath(
                "/html/body/div[1]/section/nav/div[2]/div/div/div[3]/div/div[5]/div[2]/div/div[2]/div[2]/div/div/div/div/div/div/div"
            ).click()
            sleep(5)

    def followProfiles(self, targetUser: str) -> None:
        if not self._isLogged:
            raise Exception(
                "First, you should be logged in. Before starting to follow, run 'yourBabyRugrat.login()'"
            )

        self._rugratSession.driver.get("https://www.instagram.com/" +
                                       targetUser)
        self._rugratSession.driver.ensure_element_by_xpath(
            "/html/body/div[1]/section/main/div/header/section/ul/li[3]/a"
        ).click()

        self._rugratSession.transfer_driver_cookies_to_session()

        numberOfFollowers = int(
            self.getNumberOfFollowers(targetUser).replace(",", ""))

        followersContainerScroll = self._rugratSession.driver.ensure_element_by_xpath(
            "//div[@class='isgrP']")

        counter = 0
        while counter < int(numberOfFollowers / 7):
            self._rugratSession.driver.execute_script(
                "arguments[0].scrollTop = arguments[0].scrollTop + arguments[0].offsetHeight;",
                followersContainerScroll,
            )
            sleep(2)
            counter += 1

        # self._rugratSession.driver.execute_script("window.scrollIntoView();")
        for userToFollow in range(1, numberOfFollowers):
            sleep(20)
            self._rugratSession.driver.ensure_element_by_xpath(
                "/html/body/div[4]/div/div/div[2]/ul/div/li[" +
                str(userToFollow) + "]/div/div[3]/button").click()

    def getNumberOfFollowers(self, targetUser: str) -> str:
        if not self._isLogged:
            raise Exception(
                "First, you should be logged in. Before commenting, run 'yourBabyRugrat.login()'"
            )
        profileResponse = self._rugratSession.get(
            "https://www.instagram.com/" + targetUser)
        soupResponse = BeautifulSoup(profileResponse.text, "html.parser")
        metaTags = soupResponse.find_all("meta")
        numberOfFollowers = str()
        for tag in metaTags:
            if str(tag).lower().find("followers") != -1:
                numberOfFollowers = tag

        numberOfFollowers = str(numberOfFollowers).split()
        numberOfFollowers = numberOfFollowers[3]

        return numberOfFollowers

    def commentingByScrapingStuff(self, instagramUrlToComment: str,
                                  subjectToComment: str) -> None:
        # not implemented yet
        pass

    def commentingByList(self, instagramUrlToComment: str,
                         listOfComments: List) -> None:
        if not self._isLogged:
            raise Exception(
                "First, you should be logged in. Before commenting, run 'yourBabyRugrat.login()'"
            )

        # Load target instagram page **outset**
        self._rugratSession.driver.get(instagramUrlToComment)
        # Load target instagram page **end**

        # start commenting
        while True:
            maxTimeToComment = self._rangeTimeBetComments + 100
            try:
                index = randrange(0, len(listOfComments))
                sleepTime = randrange(self._rangeTimeBetComments,
                                      maxTimeToComment)
                commentArea = self._rugratSession.driver.ensure_element_by_xpath(
                    "/html/body/div[1]/section/main/div/div[1]/article/div[3]/section[3]/div[1]/form/textarea"
                )
                commentArea.click()
                commentArea = self._rugratSession.driver.ensure_element_by_xpath(
                    "/html/body/div[1]/section/main/div/div[1]/article/div[3]/section[3]/div[1]/form/textarea"
                )

                if not self.isInternetOn():
                    continue

                commentArea.send_keys(listOfComments[index])
                commentArea.submit()
                sleep(sleepTime)

            except KeyboardInterrupt as interrupted:
                try:
                    print(interrupted)
                    sys.exit(0)
                except SystemExit:
                    os._exit(0)
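A hypothetical driver for RugratsBot above; the credentials, target post and comment list are placeholders, and Instagram's markup changes often, so the hard-coded XPaths may need updating:

bot = RugratsBot('my_username', 'my_password')
if bot.isInternetOn():
    bot.login(saveLoginInformation=False)
    # commentingByList() loops until interrupted with Ctrl+C
    bot.commentingByList('https://www.instagram.com/p/SOME_POST/',
                         ['Nice shot!', 'Love this one'])
    bot.logout()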
Example 6
class Downloader:
    def __init__(self,
                 username,
                 password,
                 driver_path=None,
                 download_path=None,
                 headless=True,
                 logger=None):
        if not logger:
            logging.basicConfig(level=logging.DEBUG)
            self.logger = logging.getLogger(__name__)
            self.logger.setLevel('DEBUG')
        else:
            self.logger = logger
        self._username = username
        self._password = password
        self.driver_path = driver_path
        self.download_path = download_path
        self.url = 'https://enregistreur.prosodie.com/odigo4isRecorder/EntryPoint?serviceName=LoginHandler'
        self.headless = headless
        self.validated = False
        self.active = False

    def __str__(self):
        return f"\nDOWNLOAD PATH: {self.download_path}\nHEADLESS: {self.headless}\n" \
            f"DRIVER PATH: {self.driver_path}\nUSERNAME: {self._username}\nURL: {self.url}"

    def setup_selenium_browser(self):
        if self.active:
            return f"Session/Browser already active. Cannot have two concurrent sessions/browsers"
        options = webdriver.ChromeOptions()
        prefs = {
            'download.default_directory': self.download_path,
            'download.prompt_for_download': False,
            'download.directory_upgrade': True,
            'safebrowsing.enabled': False,
            'safebrowsing.disable_download_protection': True
        }
        options.add_experimental_option('prefs', prefs)

        if self.headless:
            options.add_argument('--headless')

        self.browser = webdriver.Chrome(self.driver_path, options=options)

        if self.headless:
            self.browser.command_executor._commands["send_command"] = (
                "POST", '/session/$sessionId/chromium/send_command')
            params = {
                'cmd': 'Page.setDownloadBehavior',
                'params': {
                    'behavior': 'allow',
                    'downloadPath': self.download_path
                }
            }
            command_result = self.browser.execute("send_command", params)
            for key in command_result:
                self.logger.debug("result:" + key + ":" +
                                  str(command_result[key]))

        self.active = True

    def setup_requestium_session(self):
        if self.active:
            return f"Session/Browser already active. Cannot have two concurrent sessions/browsers"
        if self.headless:
            webdriver_options = {'arguments': ['headless']}
        else:
            webdriver_options = {}
        self.logger.debug(
            f"Creating Session object with values: {webdriver_options}")
        self.session = Session(webdriver_path=self.driver_path,
                               browser='chrome',
                               default_timeout=15,
                               webdriver_options=webdriver_options)
        self.active = True

    def login_requestium(self):
        if self.active:
            raise CustomException(f"Cannot have two active sessions/browsers")
        self.setup_requestium_session()
        self.logger.debug(f"Going to URL: {self.url}")
        self.session.driver.get(self.url)
        self.logger.debug(f"Entering credentials")
        self.session.driver.ensure_element_by_name('mail').send_keys(
            self._username)
        self.session.driver.ensure_element_by_name('password').send_keys(
            self._password)
        self.session.driver.ensure_element_by_name('valider').click()
        self.validated = True

    def login_selenium(self):
        if self.active:
            raise CustomException(f"Cannot have two active sessions/browsers")
        self.setup_selenium_browser()
        self.browser.get(self.url)
        self.browser.find_element_by_name('mail').send_keys(self._username)
        self.browser.find_element_by_name('password').send_keys(self._password)
        self.browser.find_element_by_name('valider').click()
        return

    def download_mp3(self, path=None, ref=None, xpath=None):
        self.logger.info(
            f"\ndownload_mp3 called with:\nPATH: {path},\nREF: {ref},\nXPATH: {xpath}"
        )
        if ref is not None and xpath is None:
            self.session.driver.ensure_element_by_class_name(
                'x-action-col-icon').click()
        elif xpath is not None and ref is None:
            self.session.driver.ensure_element_by_xpath(xpath).click()
        else:
            self.logger.error("Provide exactly one of ref or xpath")
            return
        self.session.driver.switch_to.frame('result_frame')
        time.sleep(1)
        # Get URL of mp3 file
        src = self.session.driver.ensure_element_by_id(
            'messagePlayer').get_attribute('src')
        # Selenium --> Requests
        self.session.transfer_driver_cookies_to_session()
        # Download
        r = self.session.get(src, stream=True)
        if path is None:
            if ref is None:
                # Get ref number
                soup = BeautifulSoup(self.session.driver.page_source, 'lxml')
                ref = soup.findAll('div', class_='x-grid-cell-inner')[1].text
            path = '%s.mp3' % ref
        if r.status_code == 200:
            with open(path, 'wb') as f:
                for chunk in r.iter_content(1024 * 2014):
                    f.write(chunk)
        else:
            return 1
        # Requests --> Selenium
        self.session.transfer_session_cookies_to_driver()
        self.session.driver.switch_to.default_content()
        return

    def download_mp3_by_ref(self, ref, path=None):
        self.login_requestium()
        self.search_by_ref(ref)
        result = self.download_mp3(path, ref)
        if result == 1:
            return 1
        self.session.driver.close()

    def download_mp3_by_csv(self, csv_path, download_dir=None):
        if download_dir is None:
            download_dir = self.download_path
        self.login_requestium()
        refs = pd.read_csv(csv_path, sep=';').Name
        length = len(refs)
        for i, ref in enumerate(refs):
            sys.stdout.write('\r')
            sys.stdout.write('downloading: %s/%s' % (i + 1, length))
            sys.stdout.flush()
            self.search_by_ref(ref)
            mp3_path = None
            if download_dir is not None:
                file_name = '%s.mp3' % ref
                mp3_path = os.path.join(download_dir, file_name)
            result = self.download_mp3(path=mp3_path, ref=ref)
            if result == 1:
                return 1
        sys.stdout.write('\n')
        sys.stdout.flush()
        self.session.driver.close()
        return "Finished"

    def search_by_ref(self, ref):
        self.session.driver.get(self.url)
        self.session.driver.ensure_element_by_name('refEr').send_keys(ref)
        self.session.driver.ensure_element_by_id('button-1009').click()

    def change_date_format(self, date):
        try:
            correct_string = date.strptime(str(date.date()),
                                           '%Y-%m-%d').strftime('%d-%m-%Y')
            return correct_string
        except Exception as e:
            raise e

    def change_time_format(self, date):
        try:
            correct_string = date.strptime(
                str(date.hour) + ':' + str(date.minute),
                "%H:%M").strftime("%I:%M %p")
            if correct_string[0] == "0":
                return correct_string[1::]
            else:
                return correct_string
        except Exception as e:
            raise e

    def ceil_dt(self, dt, delta):
        """Round up to the nearest half hour"""
        return dt + (datetime.datetime.min - dt) % delta

    def set_range(self, now):
        """
        Takes current datetime and finds the nearest, previous half hour.
        Returns the appropriate start and end times and date
        """
        # Format: '10-19-2018'
        # Format: '12:00 AM'
        hour_ago = now - datetime.timedelta(minutes=60)
        rounded = self.ceil_dt(hour_ago, datetime.timedelta(minutes=30))

        start_date = self.change_date_format(rounded)
        start_time = self.change_time_format(rounded)
        thirty_mins = datetime.timedelta(minutes=30)
        end_date = start_date
        end_time = self.change_time_format(rounded + thirty_mins)
        return (start_date, start_time, end_date, end_time)

    def search_by_range(self, start_date, start_time, end_date, end_time):
        """ Doesn't work correctly. Date seems to work but time not so much.

        Search records on www.prosodie.com by date range and return session.
        Input:
            s -- Requestium session (required |
                type: requestium.requestium.Session);
            start_date -- start date (not required | type: str). Format:
                        'mm:dd:yyyy'. Example: '03-05-1991';
            start_time -- start time (not required | type: str). Example:
                        '12:00 AM';
            end_date -- end date (not required | type: str). Format:
                        'mm:dd:yyyy'. Example: '03-05-1991';
            end_time -- end time (not required | type: str). Example: '12:00 PM'.
        Output:
            s -- Requestium session (type: requestium.requestium.Session).

        """
        if start_date:
            self.browser.find_element_by_name('dateDebut').send_keys(
                start_date)
        if start_time:
            self.browser.find_element_by_name('heureDebut').send_keys(
                start_time)
        if end_date:
            self.browser.find_element_by_name('dateFin').send_keys(end_date)
        if end_time:
            self.browser.find_element_by_name('heureFin').send_keys(end_time)
        self.browser.find_element_by_id('button-1009').click()
        return

    def download_all_half_hour(self):
        self.logger.debug(f"Downloading calls from last half hour")
        self.logger.debug(f"Login check...")
        if not self.validated:
            self.logger.debug(f"Not logged in. Validating")
            self.login_selenium()
        self.logger.debug(f"Logged in.")
        self.logger.debug(f"Getting search range")
        search_range = self.set_range(datetime.datetime.now())
        sleep(2)
        self.logger.debug(f"Applying filters")
        self.browser.find_element_by_id("criteres-inputEl").send_keys('_EN')
        self.search_by_range(*search_range)
        sleep(5)
        self.logger.debug(f"Downloading results to {self.download_path}")
        csvB = self.browser.find_element_by_id("csvButton")
        csvB.click()
        self.browser.find_element_by_id("button-1006").click()
        self.browser.switch_to.window(self.browser.window_handles[1])
        sleep(5)
        self.logger.debug(f"Ending session")
Example 7
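The snippet below assumes a requestium Session named s has already been created; a plausible setup (the chromedriver path is illustrative) is:

from requestium import Session, Keys

reddit_user_name = ''  # optionally pre-fill your username here

s = Session('./chromedriver', browser='chrome', default_timeout=15)
s.driver.get('https://www.reddit.com')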
print('Waiting for elements to load...')
s.driver.ensure_element_by_class_name(
    "desktop-onboarding-sign-up__form-toggler", state='visible').click()

if reddit_user_name:
    s.driver.ensure_element_by_id('user_login').send_keys(reddit_user_name)
    s.driver.ensure_element_by_id('passwd_login').send_keys(Keys.BACKSPACE)
print('Please log-in in the chrome browser')

s.driver.ensure_element_by_class_name("desktop-onboarding__title",
                                      timeout=60,
                                      state='invisible')
print('Thanks!')

if not reddit_user_name:
    reddit_user_name = s.driver.xpath(
        "//span[@class='user']//text()").extract_first()

if reddit_user_name:
    s.transfer_driver_cookies_to_session()
    response = s.get(
        "https://www.reddit.com/user/{}/".format(reddit_user_name))
    cmnt_karma = response.xpath(
        "//span[@class='karma comment-karma']//text()").extract_first()
    reddit_golds_given = response.re_first(r"(\d+) gildings given out")
    print("Comment karma: {}".format(cmnt_karma))
    print("Reddit golds given: {}".format(reddit_golds_given))
else:
    print("Couldn't get user name")
Example 8
class HTMLParser(object):
    """
    Assign parsing tasks to this class.

    It will maintain a queue and parse websites in multiple threads with random
    proxy switching.

    TODO: make headless mode optional.
    """
    def __init__(
        self,
        mode: str = 'requestium',
        use_cache: bool = True,
        max_cache_size: int = 10000,
        timeout: int = 15,
        browser: str = 'chrome',
        loading_time: int = 3,  # delay (seconds) to wait for the page to load
        webdriver_path: str = os.path.join(curr_dir, 'chromedriver')):
        assert mode in ['requests', 'selenium', 'requestium']
        assert browser in ['chrome']

        self.mode = mode
        self.loading_time = loading_time
        self.timeout = timeout
        self.use_cache = use_cache
        if use_cache:
            self.html_cache = LRUCache(maxsize=max_cache_size)

        if mode == 'requests':
            pass
        elif mode == 'selenium':
            from selenium import webdriver
            from selenium.webdriver.chrome.options import Options
            chrome_options = Options()
            chrome_options.add_argument("--headless")
            self.driver = webdriver.Chrome(webdriver_path,
                                           chrome_options=chrome_options)
        elif mode == 'requestium':
            from requestium import Session, Keys
            self.session = Session(
                webdriver_path=webdriver_path,
                browser='chrome',
                default_timeout=timeout,
                webdriver_options={'arguments': ['headless']})
        else:
            assert False, '"mode" must be either requests, selenium, or requestium.'

    def _get_html(self,
                  url: str,
                  use_driver: bool = False,
                  check_status: bool = False) -> str:
        """
        TODO: Add asynchronous queue

        use_driver is only used in requestium mode.
        TODO: check_status: when using the driver we cannot read the HTTP status code; see
        https://stackoverflow.com/questions/5799228/how-to-get-status-code-by-using-selenium-py-python-code
        """
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
        }

        if self.mode == 'requests':
            raw_html = requests.get(url, headers=headers)
            if raw_html.status_code == 200:
                return raw_html.text

        elif self.mode == 'selenium':
            self.driver.get(url)
            # give the driver some time to load the webpage
            time.sleep(self.loading_time)
            return self.driver.page_source

        elif self.mode == 'requestium':
            if use_driver:
                self.session.driver.get(url)
                # give the driver some time to load the webpage
                time.sleep(self.loading_time)
                return self.session.driver.page_source
            else:
                raw_html = self.session.get(url, headers=headers)
                if raw_html.status_code == 200:
                    return raw_html.text

    def get_html_directly(self,
                          url: str,
                          use_driver: bool = False,
                          check_status: bool = False) -> str:
        """
        Cache wrapper

        TODO: auto fix url schema (i.e. add http or https)
        (requests.exceptions.MissingSchema)
        """
        if self.use_cache:
            if url not in self.html_cache:
                html = self._get_html(url, use_driver, check_status)

                if not html:
                    return None

                self.html_cache[url] = html
            return self.html_cache[url]
        else:
            return self._get_html(url, use_driver, check_status)
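A hedged usage sketch for HTMLParser above; the URL is illustrative, and the module is assumed to provide curr_dir, LRUCache (e.g. from cachetools), os, time and requests:

parser = HTMLParser(mode='requestium', use_cache=True, timeout=15)

# The first call hits the network; the second is served from the LRU cache.
html = parser.get_html_directly('https://example.com')
html_again = parser.get_html_directly('https://example.com')
print(html is html_again)  # True when caching is enabled and the fetch succeeded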
Example 9
class API:
    def __init__(self, path):
        self.last_json = ""
        self.last_response = None
        self.IG_SIG_KEY = '4f8732eb9ba7d1c8e8897a75d6474d4eb3f5279137431b2aafb71fafe2abe178'
        self.SIG_KEY_VERSION = '4'
        self.USER_AGENT = 'Instagram 10.26.0 Android ({android_version}/{android_release}; 640dpi; 1440x2560; {manufacturer}; {device}; {model}; samsungexynos8890; en_US)'.format(
            **DEVICE_SETTINTS)
        self.s = Session(webdriver_path=path,
                         browser='chrome',
                         default_timeout=15)
        self.logger = logging.getLogger('[instatesi_{}]'.format(id(self)))
        self.privateUsers = {}
        self.users = {}
        fh = logging.FileHandler(filename='instatesi.log')
        fh.setLevel(logging.INFO)
        fh.setFormatter(logging.Formatter('%(asctime)s %(message)s'))

        ch = logging.StreamHandler()
        ch.setLevel(logging.DEBUG)
        ch.setFormatter(
            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))

        self.logger.addHandler(fh)
        self.logger.addHandler(ch)
        self.logger.setLevel(logging.DEBUG)
        self.lastUserHandled = None

    def saveScrapedFollowers(self):
        import json
        self.logger.info("Except for the selected Followers...")
        if not os.path.exists(os.getcwd() + "/ScrapedFollowers/" +
                              self.lastUserHandled + ".txt"):
            with open(
                    os.getcwd() + "/ScrapedFollowers/" + self.lastUserHandled +
                    ".txt", "w") as f:
                """ f.write("Scraped following from " + self.lastUserHandled +"\n")
                f.write("-------Non private users-------\n")
                f.write(json.dumps(self.users[''], indent=2))
                f.write("\n-------Private users-------\n")
                f.write(json.dumps(self.privateUsers, indent=2)) """
                for k, v in self.users.items():
                    f.write(k + ',\n')
            self.logger.info("Following successfully saved!")
            self.users = dict()
            self.privateUsers = dict()

        else:
            self.logger.warning(
                "Warning! The user is already present in the database. Overwrite?"
            )
            #Define some logic for file overwriting

    def saveScrapedFollowing(self):
        import json
        self.logger.info("Except for the Following's Following...")
        if not os.path.exists(os.getcwd() + "/ScrapedFollowing/" +
                              self.lastUserHandled + ".txt"):
            with open(
                    os.getcwd() + "/ScrapedFollowing/" + self.lastUserHandled +
                    ".txt", "w") as f:
                f.write("Scraped following from " + self.lastUserHandled +
                        "\n")
                f.write("-------Non private users-------\n")
                f.write(json.dumps(self.users, indent=2))
                f.write("\n-------Private users-------\n")
                f.write(json.dumps(self.privateUsers, indent=2))
            self.logger.info("Following successfully saved!")
            self.users = dict()
            self.privateUsers = dict()
        else:
            self.logger.warning(
                "Warning! The user is already present in the database. Overwrite?"
            )
            #Define some logic for file overwriting

    def getUserFollowers(self, userID, rank_token, selection="followers"):
        self.logger.info("User ID follower scraping started " + str(userID))
        followers = self.getTotalFollowers(userID,
                                           rank_token,
                                           fromInput=selection)
        return [str(item['username'])
                for item in followers][::-1] if followers else []

    def __getUsernameInfo(self, usernameId):
        return self.__send_request('users/' + str(usernameId) + '/info/')

    def __send_request_for_user_followers(self,
                                          user_id,
                                          rank_token,
                                          max_id='',
                                          selection="followers"):
        url = 'friendships/{user_id}/followers/?rank_token={rank_token}' if selection == "followers" else 'friendships/{user_id}/following/?max_id={max_id}&ig_sig_key_version={sig_key}&rank_token={rank_token}'
        url = url.format(
            user_id=user_id,
            rank_token=rank_token) if selection == "followers" else url.format(
                user_id=user_id,
                max_id=max_id,
                sig_key=self.SIG_KEY_VERSION,
                rank_token=rank_token)
        if max_id:
            url += '&max_id={max_id}'.format(max_id=max_id)
        return self.__send_request(url)

    def searchUsername(self, username):
        url = 'users/{username}/usernameinfo/'.format(username=username)
        self.logger.info("Looking for user information " + username)
        return self.__send_request(url)

    def getUsernameFromID(self, user_id):
        url = 'users/{user_id}/info/'.format(user_id=user_id)
        self.__send_request(url)
        self.logger.info("Return the requested username, or " +
                         str(self.last_json['user']['username']))
        return self.last_json['user']['username']

    def __generateSignature(self, data, IG_SIG_KEY, SIG_KEY_VERSION):
        body = hmac.new(
            IG_SIG_KEY.encode('utf-8'), data.encode('utf-8'),
            hashlib.sha256).hexdigest() + '.' + urllib.parse.quote(data)
        signature = 'ig_sig_key_version={sig_key}&signed_body={body}'
        return signature.format(sig_key=SIG_KEY_VERSION, body=body)

    def castUsernameToUserID(self, usernameToLook):
        self.lastUserHandled = usernameToLook
        userID = ""
        self.searchUsername(usernameToLook)
        if "user" in self.last_json:
            userID = str(self.last_json["user"]["pk"])
        self.logger.info("The username " + usernameToLook +
                         " corresponds to the ID " + userID)
        return userID

    def seeStories(self):
        self.__send_request("feed/reels_tray/")
        return self.last_json

    def getTotalFollowers(self, usernameId, rank_token, fromInput="followers"):
        sleep_track = 0
        followers = []
        next_max_id = ''
        self.__getUsernameInfo(usernameId)
        if "user" in self.last_json:
            total_followers = self.last_json["user"][
                'follower_count'] if fromInput == "followers" else self.last_json[
                    "user"]['following_count']
            if total_followers > 200000:
                self.logger.warning(
                    "There are over 200,000 followers. It may take a while.")
        else:
            return False
        with tqdm(total=total_followers,
                  desc="Retrieving followers",
                  leave=False) as pbar:
            while True:
                self.__send_request_for_user_followers(usernameId,
                                                       rank_token,
                                                       next_max_id,
                                                       selection=fromInput)
                temp = self.last_json
                try:
                    pbar.update(len(temp["users"]))
                    for item in temp["users"]:
                        if item['is_private']:
                            self.privateUsers[item['username']] = {
                                'ID': item['pk'],
                                'user_handle': item['username'],
                                'is_verified': item['is_verified'],
                                'is_private': item['is_private'],
                                'profile pic': item['profile_pic_url'],
                                'Full Name': item['full_name']
                            }
                        else:
                            self.users[item['username']] = {
                                'ID': item['pk'],
                                'user_handle': item['username'],
                                'is_private': item['is_private'],
                                'is_verified': item['is_verified'],
                                'profile pic': item['profile_pic_url'],
                                'Full Name': item['full_name']
                            }
                        followers.append(item)
                        sleep_track += 1
                        if sleep_track >= 20000:
                            import random
                            sleep_time = random.randint(120, 180)
                            self.logger.info("Waiting for " +
                                             str(float(sleep_time / 60)) +
                                             " due to excessive demands.")
                            time.sleep(sleep_time)
                            sleep_track = 0
                    if len(temp["users"]
                           ) == 0 or len(followers) >= total_followers:
                        self.logger.info(
                            "Returning the followers scraped so far: "
                            + str(len(followers[:total_followers])))
                        return followers[:total_followers]
                except Exception:
                    self.logger.error(
                        "Returning the followers scraped so far: "
                        + str(len(followers[:total_followers])))
                    return followers[:total_followers]
                if temp["big_list"] is False:
                    self.logger.info(
                        "Returning the followers scraped so far: "
                        + str(len(followers[:total_followers])))
                    return followers[:total_followers]
                next_max_id = temp["next_max_id"]

    def __send_request(self,
                       endpoint,
                       post=None,
                       login=False,
                       with_signature=True):
        self.s.headers.update({
            'Connection': 'close',
            'Accept': '*/*',
            'Content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Cookie2': '$Version=1',
            'Accept-Language': 'en-US',
            'User-Agent': self.USER_AGENT
        })
        try:
            if post is not None:  # POST
                if with_signature:
                    post = self.__generateSignature(post, self.IG_SIG_KEY,
                                                    self.SIG_KEY_VERSION)
                response = self.s.post('https://i.instagram.com/api/v1/' +
                                       endpoint,
                                       data=post)
            else:  # GET
                response = self.s.get('https://i.instagram.com/api/v1/' +
                                      endpoint)
        except Exception as e:
            self.logger.error("Exception due to endpoint " + endpoint)
            self.logger.error(e)
            return False
        if response.status_code == 200:
            self.logger.info("The request to the endpoint " + endpoint +
                             " has been successful")
            self.last_response = response
            self.last_json = json.loads(response.text)
            return True
        else:
            return False
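For reference, a standalone sketch of the signed-body scheme implemented by __generateSignature() and __send_request() above; the payload is illustrative:

import hashlib
import hmac
import json
import urllib.parse

IG_SIG_KEY = '4f8732eb9ba7d1c8e8897a75d6474d4eb3f5279137431b2aafb71fafe2abe178'
SIG_KEY_VERSION = '4'

data = json.dumps({'_uuid': 'device-uuid', '_csrftoken': 'token'})  # illustrative payload
digest = hmac.new(IG_SIG_KEY.encode('utf-8'), data.encode('utf-8'),
                  hashlib.sha256).hexdigest()
body = digest + '.' + urllib.parse.quote(data)
signed = 'ig_sig_key_version={sig_key}&signed_body={body}'.format(
    sig_key=SIG_KEY_VERSION, body=body)
# `signed` is the form payload that __send_request() posts for signed requests.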
Example 10
class Github:
    def __init__(self, proxy=None):
        self.cookies = None
        self.proxy = proxy
        self.sess = Session(webdriver_path='/usr/local/bin/chromedriver',
                            browser='chrome',
                            default_timeout=15,
                            webdriver_options={'arguments': ['headless']})
        if proxy:
            self.sess.proxies['http'] = proxy
            self.sess.proxies['https'] = proxy
        self.user = None

    def save_session(self, name, password, cookie):
        gprint("save cred and session")
        with open(GITHUB_LOGIN, "wb") as fp:
            u = {"user": name, "pass": password}
            pickle.dump(u, fp)

        with open(GITHUB_SESSION, 'wb') as fp:
            pickle.dump(cookie, fp)

    def load_session(self):
        gprint("load seesion form github")
        if os.path.exists(GITHUB_SESSION):
            with open(GITHUB_SESSION, 'rb') as fp:
                self.cookies = pickle.load(fp)
                self.sess.cookies.update(self.cookies)
                self.sess.get("https://github.com")
                self.sess.transfer_session_cookies_to_driver()

            with open(GITHUB_LOGIN, 'rb') as fp:
                u = pickle.load(fp)
                self.user = u['user']

        elif os.path.exists(GITHUB_LOGIN):
            with open(GITHUB_LOGIN, 'rb') as fp:
                u = pickle.load(fp)
                self.login(name=u['user'], password=u['pass'])
        else:
            name = input('Github name:')
            passwd = getpass.getpass("Github pass:")
            self.login(name=name, password=passwd)

    def login(self, name, password):
        self.sess.driver.get("https://github.com/login")
        self.sess.driver.find_element_by_css_selector(
            "input[name=login]").send_keys(name)
        self.sess.driver.find_element_by_css_selector(
            "input[name=password]").send_keys(password)
        self.sess.driver.find_element_by_css_selector(
            "input[name=commit]").click()

        self.sess.transfer_driver_cookies_to_session()
        self.cookies = self.sess.cookies.get_dict()
        gprint(str(self.cookies))
        self.save_session(name, password, self.cookies)

    def weak_search(self, key):
        self.load_session()
        self.search(key, "smtp")
        self.search(key, "ssh")
        # with ThreadPoolExecutor(max_workers=10) as exe:
        #     for k in ['smtp', 'ssh', 'email']:
        #         s1 = exe.submit(self.search, key, k)
        #         s1.add_done_callback(print)

    def search(self, *key):
        gprint(key[-1])
        if not self.cookies:
            self.load_session()

        res = requests.get("https://github.com/{}/product".format(self.user))
        self.cookies = res.cookies.get_dict()
        gprint(str(self.cookies))
        url = "https://github.com/search?q={}&type=code".format("+".join(key))
        self.sess.driver.get(url)
        res = self.sess.driver.page_source
        b = BeautifulSoup(res, 'lxml')

        codes = b.select(".code-list-item")
        if len(codes) > 0:
            gprint("Found : %d" % len(codes))
        else:
            gprint("Not found:")
            rprint(b.text.replace("\n", ""))
            # for i in b.select("a"):
            # gprint(str(i))
        ss = {}
        for code in codes:

            k = code.select(".text-bold")[0].text
            v = {
                colored(str(n), 'green'): i.text.replace("\n", "")
                for n, i in enumerate(code.select("td.blob-code"))
            }
            gprint(colored(k, "blue"))
            Tprint(v)
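A hypothetical entry point for the Github scraper above; GITHUB_LOGIN, GITHUB_SESSION and the gprint/rprint/Tprint helpers are assumed to be defined elsewhere in the module:

if __name__ == '__main__':
    gh = Github()
    gh.weak_search('example.com')  # searches "example.com smtp" and "example.com ssh"
    gh.sess.driver.quit()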
Example 11
class Monster():
	api_throttle_secs = 3

	def __init__( self ):
		self.verbose = False
		self._session = Session(
				webdriver_path=''
				,browser='chrome'
				,default_timeout=15
				,webdriver_options={
						'arguments' : [ 'headless' ]
					}
			)

	@sleep_and_retry
	@limits( calls=1, period=api_throttle_secs )
	def apply( self, job_link ):
		'''Apply to the job at the given job link for Monster.com.

		Args:
			job_link (str_or_SearchResult): the speed apply link for the job to apply to.

		Returns:
			bool: True if successful, False otherwise.
		'''
		if isinstance( job_link, SearchResult ):
			job_link = job_link.ApplyLink
		apply_result = self._session.get( job_link )
		if apply_result.status_code == 200:
			if apply_result.json()['success'] == True:
				return True
			elif self.verbose:
				print( job_link )
				print( apply_result.json() )
		return False

	def batchApply( self, job_links ):
		''' Apply to all jobs in the list of job links given
		
		Args:
			job_links (list_or_generator): List, tuple, or generator of job links
		
		Returns:
			jobs_applied_to (int): The number of jobs applied to successfully
		'''
		jobs_quantity = 0
		quantity_applied_to = 0
		if not isinstance( job_links, types.GeneratorType ):
			jobs_quantity = len( job_links )
		progress_bar = tqdm(
			total=jobs_quantity
			,desc='Applying'
			,unit='Jobs' 
		)
		for job_link in job_links:
			if isinstance( job_links, types.GeneratorType ):
				progress_bar.total += 1
			if self.apply( job_link ):
				progress_bar.update( 1 )
		jobs_applied_to = progress_bar.n
		return jobs_applied_to

	@sleep_and_retry
	@limits( calls=1, period=api_throttle_secs )
	def login( self, email, password ):
		'''Login to the Monster.com job board site.

		Args:
			email (str): Email address for logging into Monster.com.
			password (str): Password corresponding to email address to
				login to Monster.com job board site.

		Returns:
			bool: True if successful, False otherwise.
		'''

		# GOTO LOGIN PAGE TO CHECK IF AVAILABLE & GET COOKIES
		login_page = self._session.get( SITE['login'] )
		if login_page.status_code != 200:
			raise Exception( 'ERROR: COULD NOT GET LOGIN PAGE FOR MONSTER.COM : ' + SITE['login'] )

		# BUILD FORM DATA
		login_data = {
			'AreCookiesEnabled'			:	True
			,'EmailAddress'				: 	email
			,'IsComingFromProtectedView':	False
			,'IsKeepMeLoggedInEnabled'	:	True
			,'Password'					:	password
			,'PersistLogin'				:	True
		}
		request_verification_token = \
			login_page.xpath('//input[@name="__RequestVerificationToken"]/@value').extract()[0]
		login_data.update( { '__RequestVerificationToken' : request_verification_token } )

		# LOGIN
		login_result = self._session.post( SITE['login'], data=login_data )
		if login_result.status_code == 200:
			return True
		else:
			return False

	@sleep_and_retry
	@limits( calls=1, period=api_throttle_secs )
	def getJobDetails( self, job_link ):
		''' Get dictionary of details of the job, such as title and description.

		Args:
			job_link (str or int): Either a url containing the job id in the format
				of jobid={}, such as the apply link or the job page link. Or, directly
				supply the job id if it is available.

		Returns:
			job_dict (dict): Dictionary of the job link, job title, company name,
				job address, and job description.
		'''
		job_link = str( job_link )
		if not 'jobid' in job_link:
			job_id = job_link
		else:
			job_id = parse.parse_qs( parse.urlparse( job_link ).query )['jobid'][0]
		job_url = SITE[ 'job' ].format( job_id )
		job_page = self._session.get( job_url )
		job_json = job_page.json()
		job_description = job_json[ 'jobDescription' ]
		job_title = job_json[ 'companyInfo' ][ 'companyHeader' ]
		company_name = job_json[ 'companyInfo' ][ 'name' ]
		job_address = job_json[ 'companyInfo' ][ 'jobLocation' ]
		job_dict = {
			'job_link'          :   job_link
            ,'job_title'        :   job_title
            ,'job_address'      :   job_address
            ,'company_name'     :   company_name
            ,'job_description'  :   job_description
		}
		return job_dict

	def search( self, quantity=25, filter_out_recruiting_agencies=True, **kwargs ):
		''' Search Monster.com with the given filters and yield job links.
		
		Args:
			quantity (int): The max number of results to return.
			kwargs (dict): Dictionary of filters, such as keywords, 
				type (full_time,part_time), and posteddaysago.
				
		Returns:
			SearchResult (namedtuple): generator of named tuples, each
				containing an ApplyLink and a DetailsLink. The ApplyLink,
				when followed, will apply for the job automatically. The 
				Details link will return json data about the job.
		'''
		search_url = SITE['search']['root']
		
		# HANDLE SPECIAL CASE OF JOB TYPE, WHICH MUST PRECEDE THE QUERY
		job_type_value = ''
		if 'type' in kwargs:
			job_type = kwargs['type']
			options = SITE['search']['type']['options']
			job_type_value = options[job_type] if job_type in options else ''
			kwargs.pop( 'type' )
		search_url = search_url.format(
			type=urllib.parse.quote_plus( job_type_value )
		)
			
		# FORMAT URL WITH REMAINING FILTERS
		for search_field, search_value in kwargs.items():
			if search_field in SITE['search']:
				if isinstance( SITE['search'][search_field], dict ):
					options = SITE['search'][search_field]['options']
					if search_value in options:
						options_value = options[search_value]
						search_url += '+' + urllib.parse.quote_plus( options_value )
				else:
					search_format = SITE['search'][search_field]
					search_url += \
						'&{0}'.format(search_format.format(urllib.parse.quote_plus(search_value)))

		@sleep_and_retry
		@limits( calls=1, period=self.api_throttle_secs )
		def getPage( page ):
			paged_search_url = search_url + '&page=' + str( page )
			search_page = self._session.get( paged_search_url )
			return search_page
		
		# GET AND PROCESS RETURNED JSON
		quantity_returned = 0
		page = 1
		while quantity_returned < quantity:
			search_page = getPage( page )
			if search_page.status_code != 200:
				break
			search_json = search_page.json()
			for app_dict in search_json:
				if all( key in app_dict for key in ( 'MusangKingId', 'ApplyType' ) ):
					if app_dict['MusangKingId'] != 0 and app_dict['ApplyType'] != None:			# filter jobs that are missing data / poorly formatted
						if any( x in app_dict['ApplyType'] \
							for x in QUICK_APPLY_KEYWORDS ):							# filter to include quick apply jobs only
							if not any( x.lower() in app_dict['Company']['Name'].lower() \
								for x in RECRUITING_AGENCY_KEYWORDS ) or \
								not filter_out_recruiting_agencies:						# filter jobs from recruiting agencies
								job_id = app_dict['MusangKingId']
								apply_url = SITE['speedapply'].format( job_id )
								details_url = SITE['job'].format( job_id )
								search_result = SearchResult( apply_url, details_url )
								quantity_returned += 1
								yield search_result
					if quantity_returned >= quantity:
						break
			page += 1
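A hypothetical driver for the Monster class above; SITE, SearchResult, QUICK_APPLY_KEYWORDS and RECRUITING_AGENCY_KEYWORDS are assumed to be defined in the same module, and the credentials and filters are placeholders:

monster = Monster()
monster.verbose = True
if monster.login('me@example.com', 'a-password'):
    results = monster.search(quantity=10, posteddaysago='3')
    applied = monster.batchApply(results)
    print('Applied to {} jobs'.format(applied))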
Example 12
class EFundsInfo:
    def __init__(self):
        self.session = Session(
            webdriver_path='/usr/lib/chromium-browser/chromedriver',
            browser='chrome',
            default_timeout=15,
            webdriver_options={'arguments': ['headless']})

    def __enter__(self):
        self.session = Session(
            webdriver_path='/usr/lib/chromium-browser/chromedriver',
            browser='chrome',
            default_timeout=15,
            webdriver_options={'arguments': ['headless']})
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.session.driver.quit()

    def e_funds_plan(self):
        self.session.driver.get("https://qieman.com/longwin/index")
        plan_div = self.session.driver.ensure_element_by_xpath(
            "//section[@class='plan-asset']")
        plan_list = []
        for i, tr in enumerate(
                plan_div.find_elements_by_xpath("div//table[2]//tr")[1:],
                start=1):
            summary_list = tr.text.splitlines()
            abbreviation = summary_list[0]
            fund_name = summary_list[1][:-8]
            fund_code = summary_list[1][-7:-1]
            own_amount = re.compile("[持有](\d+)[份]").search(
                summary_list[2]).group(1)
            proportion = re.compile("[:]([-\d\.]+)").search(
                summary_list[2]).group(1)
            floating_pl = re.compile("[:]([-\d\.]+)").search(
                summary_list[3]).group(1)
            plan_list.append({
                'key': i,
                'abbreviation': abbreviation,
                'fund_name': fund_name,
                'fund_code': fund_code,
                'own_amount': own_amount,
                'proportion': proportion,
                'floating_pl': floating_pl,
            })
        df = pd.DataFrame(plan_list)
        df.key = pd.to_numeric(df.key)
        df.own_amount = pd.to_numeric(df.own_amount)
        df.proportion = pd.to_numeric(df.proportion)
        df.floating_pl = pd.to_numeric(df.floating_pl)
        return df

    def transaction_history(self, func_code):
        history = []
        today = arrow.now().format('YYYY-MM-DD')
        self.session.driver.get(
            "https://qieman.com/longwin/funds/{func_code}".format(
                func_code=func_code))
        history_div = self.session.driver.ensure_element_by_xpath(
            "//section[@class='history']")
        detail_div = self.session.driver.ensure_element_by_xpath(
            "//section[@class='details']")
        amount_div_list = detail_div.find_elements_by_xpath(
            "div//span[@class='qm-amount']")
        average_price, latest_price = amount_div_list[0].text, amount_div_list[
            1].text
        history.append({
            'key': 'a',
            'date': today,
            'price': average_price,
            'action': 'a'
        })
        history.append({
            'key': 'y',
            'date': today,
            'price': latest_price,
            'action': 'y'
        })
        for idx, td in enumerate(
                history_div.find_elements_by_xpath("table/tbody/tr")):
            deal_date = td.find_element_by_xpath(
                "td//div[@class='variety-title']").text
            deal_price = td.find_element_by_xpath(
                "td//span[@class='qm-amount']").text
            action_text = td.find_element_by_xpath(
                "td//div[@class='order-action']").text
            action = "b" if "买" in action_text else "s"
            amount = pd.to_numeric(
                re.compile("[入|出](\d+)[份]").search(action_text).group(1))
            history.extend([{
                'key':
                '{index}{action}{count}'.format(index=idx,
                                                action=action,
                                                count=i),
                "date":
                deal_date,
                "price":
                deal_price,
                "action":
                action
            } for i in range(amount)])
        # df = pd.DataFrame(history).set_index("date")
        df = pd.DataFrame(history)
        # df.index = pd.to_datetime(df.index)
        df.price = pd.to_numeric(df.price)
        return df

    def e_fund_cost(self, func_code):
        self.session.driver.get(
            "https://qieman.com/longwin/funds/{func_code}".format(
                func_code=func_code))
        detail_div = self.session.driver.ensure_element_by_xpath(
            "//section[@class='details']")
        cost = detail_div.find_element_by_xpath(
            "div//span[@class='qm-amount']").text
        return pd.to_numeric(cost)

    def fund_value_history(self, fund_code, duration='1m'):
        """
        Query fund trading history data from Sina finance
        :param duration: string
            default is '1m', means query one month history.
            OR using one of following:
                '1m' - one month history,
                '3m' - three month history,
                '6m' - six month history,
                '1y' - one year history,
                '2y' - two year history,
                '3y' - three year history.
        :param fund_code: string
            specify the code of the fund you want to query
        :return:
            DataFrame:
                date - index, trading date,
                value - fund net / annual income
                total - accumulated net value / fund million return
                change - fund net growth rate
        """
        result = []
        kv = {'1m': -1, '3m': -3, '6m': -6, '1y': -12, '2y': -24, '3y': -36}
        duration_arrow = self.get_last_trading_info(fund_code)['date'].shift(
            months=kv.get(duration, -1))
        df = ts.get_nav_history(fund_code, duration_arrow).reset_index()
        df.date = df.date.astype(str)
        return df

    def get_last_trading_date(self, fund_code):
        today = arrow.now().shift(months=-1)
        while True:
            latest_df = ts.get_nav_history(fund_code,
                                           today.format('YYYY-MM-DD'))
            if (latest_df is not None):
                return arrow.get(latest_df.index[0])
            else:
                today = today.shift(months=-1)

    def get_last_trading_info(self, fund_code):
        today = arrow.now().shift(months=-1)
        while True:
            latest_df = ts.get_nav_history(fund_code,
                                           today.format('YYYY-MM-DD'))
            if (latest_df is not None):
                return {
                    'date': arrow.get(latest_df.index[0]),
                    'price': latest_df.value[0]
                }
            else:
                today = today.shift(months=-1)

    def real_time_valuation(self, fund_code: str):
        if fund_code == '001061':
            latest_info = self.get_last_trading_info(fund_code)
            # there is no real time valuation api for 001061
            valuation_date = latest_info['date']
            real_time_value_list = [['0930', latest_info['price']],
                                    ['1500', latest_info['price']]]
        elif fund_code.startswith('16'):
            # res = self.session.get("http://qt.gtimg.cn/q=sz{func_code}".format(func_code=func_code))
            res = self.session.get(
                "http://data.gtimg.cn/flashdata/hushen/minute/sz{func_code}.js"
                .format(func_code=fund_code))
            real_time_value_list = []
            data_list = res.text.replace('\\n\\', '').splitlines()
            valuation_date = '{year}-{month}-{day}'.format(
                year='20' + data_list[1][-6:-4],
                month=data_list[1][-4:-2],
                day=data_list[1][-2:])
            for i in data_list[2:-1]:
                time, value, _ = i.split()
                real_time_value_list.append([time, value])
        else:
            res = self.session.get(
                "http://web.ifzq.gtimg.cn/fund/newfund/fundSsgz/getSsgz?app=web&symbol=jj{func_code}"
                .format(func_code=fund_code))
            json_dict = json.loads(res.text)['data']
            valuation_date = json_dict['date']
            real_time_value_list = json_dict['data']
        result = []
        for i in real_time_value_list:
            result.append({
                # 'time': '{date} {hour}:{minute}:00'.format(date=valuation_date, hour=i[0][:2], minute=i[0][2:]),
                'time': '{hour}:{minute}'.format(hour=i[0][:2], minute=i[0][2:]),
                'value': i[1],
            })

        df = pd.DataFrame(result)
        df.value = pd.to_numeric(df.value)
        return df  # f100032 = EFundsInfo()
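
A minimal usage sketch for the fund helpers above. The wrapping class name EFundsInfo is only hinted at by the trailing comment, and the fund code 001061 is taken from the code itself; treat both as assumptions and adapt them to the actual module.

if __name__ == '__main__':
    info = EFundsInfo()  # assumed class name, hinted at by the trailing comment above
    nav_df = info.fund_value_history('001061', duration='3m')  # three months of NAV history
    print(nav_df.head())
    rt_df = info.real_time_valuation('001061')  # intraday valuation curve
    print(rt_df.tail())
    print('average cost:', info.e_fund_cost('001061'))  # holding cost scraped from qieman.com
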
Esempio n. 13
class Driver(object):
    def __init__(self):
        # Use requestium's Session, which combines requests and Selenium; headless mode is optional
        self.s = Session(
            webdriver_path='./chromedriver',
            browser='chrome',
            default_timeout=15,
            #webdriver_options={'arguments': ['headless']}
        )
        self.category_mapping = None

        # path = os.path.join(os.getcwd(), FILENAME)
        # if os.path.exists(path):
        #     self.category_mapping = ujson.load(open(path))
        #     pprint(self.category_mapping)

    def close(self):
        if self.s.driver is not None:
            self.s.driver.quit()
        if self.s is not None:
            self.s.close()

    def login(self):
        """
        Log in to Qixinbao (qixin.com) with the Selenium driver
        """
        login_url = 'http://www.qixin.com/auth/login?return_url=%2F'
        self.s.driver.get(login_url)

        # Locate elements with requestium's ensure_* helpers
        user_element = self.s.driver.ensure_element_by_xpath(
            LOGIN_XPATH['username'])
        for c in USERNAME:
            # Type the username and password with random pauses between keystrokes
            user_element.send_keys(c)
            time.sleep(random.randint(0, 2))

        password_element = self.s.driver.ensure_element_by_xpath(
            LOGIN_XPATH['password'])
        for c in PASSWORD:
            password_element.send_keys(c)
            time.sleep(random.random())
        password_element.send_keys(Keys.ENTER)
        self.s.driver.implicitly_wait(20)

    def process_cookies(self):
        """
        Fetch the page with requests
        """
        # Hand the driver's cookies over to the requests session
        tmp_url = 'http://www.qixin.com/search?area.province=12&page=1&scope[]=1'
        self.s.driver.get(tmp_url)
        self.s.transfer_driver_cookies_to_session()
        self.s.copy_user_agent_from_driver()

        # Check whether the category mapping already exists
        if self.category_mapping is None:
            req = self.s.get('http://www.qixin.com')
            self.category_mapping = {}
            for element in req.xpath(CATEGORY_XPATH['info']):
                category_l1 = element.xpath(
                    CATEGORY_XPATH['l1']).extract_first().strip()
                category_l2 = element.xpath(CATEGORY_XPATH['l2']).extract()
                self.category_mapping[category_l1] = category_l2
                ujson.dump(self.category_mapping,
                           open(os.path.join(os.getcwd(), FILENAME), 'w'))

    def fetch_page_with_chrome(self, url):
        self.s.transfer_session_cookies_to_driver()
        self.s.driver.get(url)

    def fetch_page_with_requests(self, url):
        """
        url = 'http://www.qixin.com/search?area.province=12&page=%s&scope[]=1&sorter=4' % page
        :param url: the URL to request
        :return: a list of parsed results
        """
        # With the cookies in place, scrape the data through the requests session
        self.s.proxies.update({
            'http': 'http://forward.xdaili.cn:80',
            'https': 'https://forward.xdaili.cn:80'
        })
        self.s.headers.update({'Proxy-Authorization': sign()})
        req = self.s.get(url)
        result = parse_list(req)
        return result
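
A hedged sketch of how this Driver could be driven end to end; USERNAME, PASSWORD, LOGIN_XPATH, CATEGORY_XPATH, FILENAME, sign() and parse_list() are module-level names assumed to exist in the original project.

if __name__ == '__main__':
    d = Driver()
    try:
        d.login()            # type the credentials into qixin.com via Selenium
        d.process_cookies()  # hand the Selenium cookies over to the requests session
        url = 'http://www.qixin.com/search?area.province=12&page=1&scope[]=1&sorter=4'
        for company in d.fetch_page_with_requests(url):
            print(company)
    finally:
        d.close()
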
Esempio n. 14
class HuPu:

    commentaries = [
        '朋友圈每日更新 各种秒价第一时间了解:clpro7',
        '回收各种球鞋 aj 喷泡 椰子 实战 急用鞋换钱 闲置清理空间 全新二手皆可 打包优先 寻求多方合作 更多精彩尽在: clpro7',
        '最大限度发挥球鞋价值 接各种套现寄卖 全新二手都可以 加微信:clpro7 秒价实时更新'
    ]
    mail = {
        'recipient': '*****@*****.**',
        'subject': 'HuPu',
        'content': '请重新登录 http://39.107.86.245:8080'
    }

    def __init__(self, comment_count=30, commentaries=None, start_at=8, end_with=23):
        self.s = Session(
            './chromedriver',
            'chrome',
            default_timeout=60,
            webdriver_options={'arguments': ['headless', 'disable-gpu', f'user-agent={user_agent}']}
        )
        self.s.headers.update(s_headers)
        self.comment_count = comment_count
        self.commentaries = commentaries or self.commentaries  # fall back to the class-level defaults
        self.start_at = start_at
        self.end_with = end_with
        self.posts = Queue()
        self.exception_recoder = []

    @ExceptionReporter
    def login(self, third_party):
        """
        third party can be vx or qq
        """
        third_parties = {'vx': 0, 'qq': 1}
        resp = self.s.get('https://passport.hupu.com/pc/login')
        qrcode_urls = resp.xpath('//div[@class="login-method"]/a/@data-href').extract()
        qrcode_url = qrcode_urls[third_parties.get(third_party)]
        if third_party == 'qq':
            qrcode_url = 'https://passport.hupu.com' + qrcode_url
        self.s.driver.get(qrcode_url)
        self.s.driver.get_screenshot_as_file('qrcode.png')
        logger.info('qrcode saved!')

    @ExceptionReporter
    def get_topic_url(self):
        """
        Get the URL of this account's topic-list page
        """
        self.s.driver.get('https://www.hupu.com')
        iuid = self.s.driver.ensure_element_by_id('g_m').get_attribute('iuid')
        self.topic_url = f'https://my.hupu.com/{iuid}/topic'

    @ExceptionReporter
    def get_posts(self):
        """
        Only collect posts from the second-hand trading board (二手交易区)
        """
        logger.info('updating posts......')
        self.s.driver.get(self.topic_url)
        posts = self.s.driver.find_elements_by_xpath('//table[@class="mytopic topiclisttr"]//a')[:self.comment_count*2]
        links, plates = posts[::2], posts[1::2]
        for link, plate in zip(links, plates):
            if plate.text == '二手交易区':
                self.posts.put(link.get_attribute('href'))

    def up_post(self, post_url):
        """
        Bump a single post.
        Only send an alert email after ten consecutive failures.
        """
        try:
            self.s.driver.get(post_url)
            self.s.driver.ensure_element_by_id('atc_content').send_keys(choice(self.commentaries))
            self.s.driver.ensure_element_by_id('fastbtn').ensure_click()
            time.sleep(randrange(60, 120))
            if 'post.php?action=reply' in self.s.driver.current_url:
                logger.error('up post error! %s', post_url)
                self.exception_recoder.append(False)
            else:
                logger.info('up post success! %s', post_url)
                self.exception_recoder.append(True)
        except Exception:
            self.exception_recoder.append(False)
        if len(self.exception_recoder) < 10:
            return
        if any(self.exception_recoder):
            self.exception_recoder.pop(0)
        else:
            send_mail()

    # @ExceptionReporter
    # def up_post(self, post_url):
    #     self.s.driver.get(post_url)
    #     self.s.driver.ensure_element_by_id('atc_content').send_keys(
    #         choice(self.commentaries))
    #     self.s.driver.ensure_element_by_id('fastbtn').ensure_click()
    #     time.sleep(randrange(60, 120))
    #     if 'post.php?action=reply' in self.s.driver.current_url:
    #         logger.error('up post error! %s', post_url)
    #         self.exception_recoder.append(False)
    #     else:
    #         logger.info('up post success! %s', post_url)
    #         self.exception_recoder.append(True)

    def is_boundary(self):
        """
        Check whether the current time is inside the allowed posting window
        """
        now = arrow.now()
        if now.hour >= self.end_with:
            logger.info('%s sleeping; will resume replying at %s:00 tomorrow', now, self.start_at)
            time.sleep((now.shift(days=1).replace(hour=self.start_at, minute=0) - now).seconds)
        elif now.hour < self.start_at:
            logger.info('%s sleeping; will resume replying at %s:00 today', now, self.start_at)
            time.sleep((now.replace(hour=self.start_at, minute=0) - now).seconds)

    def up_posts(self):
        """
        Bump posts during the allowed time window
        """
        while True:
            self.is_boundary()
            while self.posts.empty():
                self.get_posts()
            self.up_post(self.posts.get())
class Driver(object):
    def __init__(self):
        # Use requestium's Session, which combines requests and Selenium; headless mode is optional
        self.s = Session(
            webdriver_path='./chromedriver',
            browser='chrome',
            default_timeout=15,
            #webdriver_options={'arguments': ['headless']}
        )
        self.category_mapping = None

        path = os.path.join(os.getcwd(), FILENAME)
        if os.path.exists(path):
            self.category_mapping = ujson.load(open(path))
            #pprint(self.category_mapping)

    def close(self):
        if self.s.driver is not None:
            self.s.driver.quit()
        if self.s is not None:
            self.s.close()

    def login(self):
        """
        Log in to Qixinbao (qixin.com) with the Selenium driver
        """
        login_url = 'http://www.qixin.com/auth/login?return_url=%2F'
        self.s.driver.get(login_url)

        # Locate elements with requestium's ensure_* helpers
        username_xpath = '//input[@class="form-control input-lg input-flat input-flat-user"]'
        user_element = self.s.driver.ensure_element_by_xpath(username_xpath)
        for c in USERNAME:
            # Type the username and password with random pauses between keystrokes
            user_element.send_keys(c)
            time.sleep(random.randint(0, 2))

        password_xpath = '//input[@class="form-control input-lg input-flat input-flat-lock"]'
        password_element = self.s.driver.ensure_element_by_xpath(
            password_xpath)
        for c in PASSWORD:
            password_element.send_keys(c)
            time.sleep(random.random())
        password_element.send_keys(Keys.ENTER)
        self.s.driver.implicitly_wait(10)

    def process_cookies(self):
        """
        Fetch the page with requests
        """
        # Hand the driver's cookies over to the requests session
        tmp_url = 'http://www.qixin.com/search?area.province=12&page=1&scope[]=1'
        self.s.driver.get(tmp_url)
        self.s.transfer_driver_cookies_to_session()
        self.s.copy_user_agent_from_driver()

        # Check whether the category mapping already exists
        if self.category_mapping is None:
            req = self.s.get('http://www.qixin.com')
            self.category_mapping = {}
            for element in req.xpath('//div[@class="grid-item"]'):
                category_l1 = element.xpath(
                    './div/text()').extract_first().strip()
                category_l2 = element.xpath('./a/text()').extract()
                self.category_mapping[category_l1] = category_l2
                ujson.dump(self.category_mapping,
                           open(os.path.join(os.getcwd(), FILENAME), 'w'))

    def fetch_page(self):
        # With the cookies in place, scrape the data through the requests session
        result = []
        self.s.proxies.update({
            'http': 'http://forward.xdaili.cn:80',
            'https': 'https://forward.xdaili.cn:80'
        })
        for page in range(1, 11):
            url = 'http://www.qixin.com/search?area.province=12&page=%s&scope[]=1&sorter=4' % page
            self.s.headers.update({'Proxy-Authorization': sign()})
            req = self.s.get(url)
            for element in req.xpath(
                    "//div[contains(@class, 'company-item')]"):
                result.append({
                    'title': element.xpath(
                        ".//div[@class='company-title']/a/text()").extract_first().strip(),
                    'legal_owner': element.xpath(
                        ".//div[@class='legal-person'][1]/text()").re_first(r'法定代表人:(\w*)').strip(),
                    'status': element.xpath(
                        ".//div[@class='company-tags']/span[1]/text()").extract_first().strip(),
                    'capital': element.xpath(
                        ".//div[contains(@class, 'col-3-1')]/text()").extract_first().strip(),
                    'date': element.xpath(
                        ".//div[contains(@class, 'col-3-2')]/text()").extract_first().strip(),
                    'url': element.xpath(
                        ".//div[@class='company-title']/a/@href").extract_first().strip()
                })
            time.sleep(10)
        return result

    def process_search_condition(self):
        """
        Build the search conditions.
        * URL: http://www.qixin.com/search?
        * param region: area.province=12, area.district=120101-120119
        * param search scope: scope[]=1
        * param sort order: sorter=3 | 4
        * param registered capital: capital=1-5
        * param industry: industry.l1 top-level industry, industry.l2 second-level industry
        * param registration year: year=1-5
        * param page: page number, at most 500; only the first 5000 search results are visible
        http://www.qixin.com/search?area.district=120101&area.province=12&capital=2&industry.l1=%E5%86%9C%E3%80%81%E6%9E%97%E3%80%81%E7%89%A7%E3%80%81%E6%B8%94%E4%B8%9A&industry.l2=%E5%86%9C%E4%B8%9A&page=1&scope[]=1&sorter=4&year=5
        """
        pass
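
Looking back at the HuPu class at the top of this example, a plausible entry point would be roughly the following; user_agent, s_headers, logger and send_mail are assumed to come from the original module, and the flow (QR login, topic URL, bump loop) is only a sketch.

if __name__ == '__main__':
    bot = HuPu(comment_count=30, commentaries=HuPu.commentaries, start_at=8, end_with=23)
    bot.login('vx')       # saves the login QR code to qrcode.png
    input('Scan qrcode.png with the app, then press Enter...')
    bot.get_topic_url()   # resolves https://my.hupu.com/<iuid>/topic
    bot.up_posts()        # loop: collect second-hand posts and bump them
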
Esempio n. 16
class AuM(object):
    def __init__(self):
        # Create a session and authenticate
        self._s = Session(
            webdriver_path='/usr/lib/chromium-browser/chromedriver',
            browser='chrome',
            webdriver_options={"arguments": ["--headless"]})
        self._s.headers.update({
            'User-Agent':
            'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0'
        })
        self._s.get('https://www.adopteunmec.com')  # Maybe not needed
        # Register a new account
        rand_s = ''.join(
            random.choice(string.ascii_lowercase + string.digits)
            for _ in range(6))
        print('email used: francois_%[email protected]' % rand_s)
        r = self._s.post('https://www.adopteunmec.com/register/index',
                         data={
                             'sex': '1',
                             'day': '03',
                             'month': '4',
                             'year': '1997',
                             'email': '*****@*****.**' % rand_s,
                             'password': '******',
                             'password_check': 'Adottami1',
                             'country': 'fr',
                             'zipcode': '06000',
                             'city': 'Nice',
                             'confirm_city': '0',
                             'pseudo': 'RedoAA',
                             'cgu': '1',
                             'reg_submit': '',
                             'by_popup': '1',
                             'PreventChromeAutocomplete': ''
                         },
                         headers={
                             "X-Requested-With": "XMLHttpRequest",
                             "Origin": "https://www.adopteunmec.com/",
                             "Referer": "https://www.adopteunmec.com/"
                         })
        status = r.json()
        # If registration was successful, go to the redirect page to confirm the account
        if (status['success'] == 1):
            self._s.get(status['redirect'])
        else:
            print('Something went wrong....')

        self._common_names = (
            'loic', 'marc', 'anthony', 'tom', 'jordan', 'florian', 'jean',
            'manu', 'seb', 'alex', 'lilian', 'angelo', 'fred', 'valent',
            'fabrice', 'fabien', 'nico', 'thomas', 'sylvain', 'tim', 'karim',
            'robin', 'pierre', 'arnaud', 'max', 'luc', 'mike', 'yann', 'oliv',
            'yvan', 'jerem', 'michel', 'mat', 'kev', 'damien', 'vinc', 'eric',
            'gilles', 'jona', 'bruno', 'simon', 'adri', 'serge', 'tony', 'jul',
            'quentin', 'leo', 'step', 'gab', 'david', 'paul', 'killian',
            'alvaro', 'ronan', 'anto', 'jb', 'jp', 'jon', 'patrick', 'virgile',
            'juju', 'stef', 'franck', 'alan', 'alain', 'albin', 'alban',
            'fran', 'cyril', 'laure', 'phil', 'jacques', 'jack', 'ludo',
            'chris', 'vic', 'jo', 'charles', 'geoffrey', 'igor', 'ciro',
            'erwan', 'fabio', 'guillaume', 'thibaut', 'romain', 'rafa',
            'lionel', 'cedric', 'xavier')

    def _common_name(self, name):
        # True if the display name contains one of the common first names listed above
        return any(n in name.lower() for n in self._common_names)

    def search_by_region(self, age_min=20, age_max=30, region=1, sex=0):
        return self.search({
            'age[min]': age_min,
            'age[max]': age_max,
            'by': 'region',
            'region': region,
            "sex": sex
        })

    def search_by_distance(self, age_min=20, age_max=30, distance=40, sex=0):
        return self.search({
            'age[min]': age_min,
            'age[max]': age_max,
            'by': 'distance',
            'distance[max]': distance,
            "sex": sex
        })

    def search(self, criteria=None):
        if criteria is None:
            return []

        # Go to search page
        self._s.get('https://www.adopteunmec.com/mySearch')
        # POST a request
        r = self._s.post('https://www.adopteunmec.com/mySearch/save',
                         data=criteria)

        time.sleep(3)  # Wait a bit...
        # Transfer cookies to Selenium, refresh the page, scroll to the bottom 10 times, and grab the profiles
        self._s.transfer_session_cookies_to_driver()
        self._s.driver.get('https://www.adopteunmec.com/mySearch/results')
        for i in range(10):
            self._s.driver.find_element_by_tag_name('html').send_keys(Keys.END)
            time.sleep(.1)
        html = BeautifulSoup(
            self._s.driver.execute_script("return document.body.innerHTML"),
            'lxml')
        self._s.transfer_driver_cookies_to_session()
        self._s.driver.close()  # Might be done before ?

        # Look for <div> tags containing user info
        blocks = html.find_all('div', {'class': 'user-infos'})
        # Get all <a> tags in a same list
        all_a = [a for sl in [b.find_all('a') for b in blocks] for a in sl]
        # Extract profile IDs, filtering out common first names so we do not visit too many profiles later
        profiles = [
            l.get('href').split('/')[-1] for l in all_a
            if isinstance(l.get('href'), str)
            and l.get('href').find('profile') > 0 and len(l.get_text()) > 2
            and not self._common_name(l.get_text())
        ]
        return profiles

    def update_db(self,
                  profiles=[],
                  max_p=None,
                  filename='data/justemenemoi.json'):
        db = {}
        try:
            with open(filename, 'r') as in_f:
                db = json.load(in_f)
        except (OSError, ValueError):
            pass

        visited = 0
        for uid in profiles:
            # Check if profile already in db
            if uid not in db:
                if max_p is not None and visited >= max_p:
                    break
                visited += 1

                url = "https://www.adopteunmec.com/profile/" + uid
                page = self._s.get(url)
                html = BeautifulSoup(
                    page.content.decode('utf-8', 'xmlcharrefreplace'), 'lxml')

                name = html.find('div', {'class': 'username'}).get_text()
                desc = html.find(text='Description').find_parent('div').find(
                    'p').get_text()
                shop = html.find(text='Shopping List').find_parent('div').find(
                    'p').get_text()
                # Profile Filtering
                if desc.find("non renseign") >= 0 or shop.find(
                        "non renseign") >= 0 or len(desc) < 20 or len(
                            shop) < 20:
                    continue

                img_url = html.find(id='img-current-pic')['src']
                img_name = img_url.split('/')[-1]
                db[uid] = {
                    "profile": url,
                    "name": name,
                    "img": img_name,
                    "age": html.find('span', {
                        'class': 'age'
                    }).get_text(),
                    "city": html.find('span', {
                        'class': 'city'
                    }).get_text(),
                    "desc": desc,
                    "shop": shop
                }

                # Download and save profile pic
                pic = self._s.get(img_url, stream=True)
                pic.raw.decode_content = True
                with open("data/pics/" + img_name, 'wb') as f:
                    shutil.copyfileobj(pic.raw, f)

                time.sleep(.5)  # Bit of rest...

        # Write back json
        json_s = json.dumps(
            db)  # Dump as a string, to write to file and as JS var
        with open(filename, 'w') as out_f:
            out_f.write(json_s)
        with open(filename + '.js', 'w') as out_f:
            out_f.write("data = ")
            out_f.write(json_s)
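
A hedged usage sketch for the AuM class above; the search parameters are illustrative, and update_db writes to data/justemenemoi.json as in the code.

if __name__ == '__main__':
    aum = AuM()                                         # registers a throwaway account first
    found = aum.search_by_region(age_min=20, age_max=30, region=1)
    aum.update_db(found, max_p=5)                       # scrape at most five new profiles
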
Esempio n. 17
class Charme(object):
    def __init__(self):
        # Create a session and authenticate
        self._s = Session(
            webdriver_path='/usr/lib/chromium-browser/chromedriver',
            browser='chrome')  #,
        #webdriver_options={"arguments": ["--headless"]})
        self._s.headers.update({
            'User-Agent':
            'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0'
        })

        # Login
        r = self._s.post('https://www.adopteunmec.com/auth/login',
                         data={
                             'username': '******',
                             'password': '******'
                         })
        if not r.ok:
            raise RuntimeError('Something went wrong during login')
        else:
            time.sleep(2)

    def search_by_distance(self, age_min=20, age_max=30, distance=40, sex=1):
        return self.search({
            'age[min]': age_min,
            'age[max]': age_max,
            'by': 'distance',
            'distance[max]': distance,
            "sex": sex
        })

    def search(self, criteria=None):
        if criteria is None:
            return []

        # Go to search page
        self._s.get('https://www.adopteunmec.com/mySearch')
        time.sleep(1)
        # POST a request
        r = self._s.post('https://www.adopteunmec.com/mySearch/save',
                         data=criteria)

        time.sleep(3)  # Wait a bit...
        # Transfer cookies to Selenium, refresh the page, scroll to the bottom 10 times, and grab the profiles
        self._s.transfer_session_cookies_to_driver()
        self._s.driver.get('https://www.adopteunmec.com/mySearch/results')
        for i in range(10):
            self._s.driver.find_element_by_tag_name('html').send_keys(Keys.END)
            time.sleep(.1)
        html = BeautifulSoup(
            self._s.driver.execute_script("return document.body.innerHTML"),
            'lxml')
        self._s.transfer_driver_cookies_to_session()
        self._s.driver.close()  # Might be done before ?

        # Look for <div> tags containing user info
        blocks = html.find_all('div', {'class': 'user-infos'})
        # Get all <a> tags in a same list
        all_a = [a for sl in [b.find_all('a') for b in blocks] for a in sl]
        # Extract profile IDs from the profile links (no common-name filtering here)
        profiles = [
            l.get('href').split('/')[-1] for l in all_a
            if isinstance(l.get('href'), str)
            and l.get('href').find('profile') > 0 and len(l.get_text()) > 2
        ]
        return profiles

    def charme(self, profiles=[], max_p=10, filename='data/charme.json'):
        db = {}
        try:
            with open(filename, 'r') as in_f:
                db = json.load(in_f)
        except (OSError, ValueError):
            pass

        visited = 0
        for uid in profiles:
            # Check if profile already in db
            if uid not in db:
                if max_p is not None and visited >= max_p:
                    break
                visited += 1

                url = "https://www.adopteunmec.com/profile/" + uid
                print "Visiting", url
                page = self._s.get(url)
                html = BeautifulSoup(
                    page.content.decode('utf-8', 'xmlcharrefreplace'), 'lxml')

                img_url = html.find(id='img-current-pic')['src']
                img_name = img_url.split('/')[-1]
                date = datetime.datetime.now().strftime("%m-%d %H:%M")
                db[uid] = {
                    "profile": url,
                    "name": html.find('div', {'class': 'username'}).get_text(),
                    "img": img_name,
                    "age": html.find('span', {'class': 'age'}).get_text(),
                    "city": html.find('span', {'class': 'city'}).get_text(),
                    "desc": html.find(text='Description').find_parent('div').find('p').get_text(),
                    "shop": html.find(text='Shopping List').find_parent('div').find('p').get_text(),
                    "charmed": date
                }

                # Download and save profile pic
                pic = self._s.get(img_url, stream=True)
                pic.raw.decode_content = True
                with open("data/pics/" + img_name, 'wb') as f:
                    shutil.copyfileobj(pic.raw, f)

                time.sleep(20)  # Bit of rest...

                # Send a charme
                url = "https://www.adopteunmec.com/events/charm?id=" + uid
                r = self._s.get(url)
                if r.json()['member']['id'] != uid:
                    raise RuntimeError('Something went wrong in the charm response')

        # Write back json
        json_s = json.dumps(
            db)  # Dump as a string, to write to file and as JS var
        with open(filename, 'w') as out_f:
            out_f.write(json_s)