Example 1
    def fetch_submission(self, slug):
        print(f"🤖 Fetching submission for problem: {slug}")
        query_params = {
            'operationName': "Submissions",
            'variables': {"offset": 0, "limit": 20, "lastKey": '', "questionSlug": slug},
            'query': '''query Submissions($offset: Int!, $limit: Int!, $lastKey: String, $questionSlug: String!) {
                submissionList(offset: $offset, limit: $limit, lastKey: $lastKey, questionSlug: $questionSlug) {
                    lastKey
                    hasNext
                    submissions {
                        id
                        statusDisplay
                        lang
                        runtime
                        timestamp
                        url
                        isPending
                        __typename
                    }
                    __typename
                }
            }'''
        }

        resp = self.session.post("https://leetcode.com/graphql",
                                 data=json.dumps(query_params).encode('utf8'),
                                 headers={
                                     "content-type": "application/json",
                                 })
        body = json.loads(resp.content)

        # parse data
        submissions = get(body, "data.submissionList.submissions")
        if submissions:
            for sub in submissions:
                if Submission.get_or_none(Submission.id == sub['id']) is not None:
                    continue

                if sub['statusDisplay'] == 'Accepted':
                    url = sub['url']
                    html = self.session.get(f'https://leetcode.com{url}').text

                    pattern = re.compile(
                        r'submissionCode: \'(?P<code>.*)\',\n  editCodeUrl', re.S
                    )

                    matched = pattern.search(html)
                    code = matched.groupdict().get('code') if matched else None
                    if code:
                        Submission.insert(
                            id=sub['id'],
                            slug=slug,
                            language=sub['lang'],
                            created=sub['timestamp'],
                            source=code.encode('utf-8')
                        ).execute()
                    else:
                        raise Exception(f"Cannot get submission code for problem: {slug}")
        random_wait(10, 15)
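Every example on this page throttles itself with a random_wait helper whose definition isn't shown. A minimal sketch, with the signature inferred from the call sites (it is invoked with two arguments, one argument, and no arguments):

import random
import time

def random_wait(min_seconds=1, max_seconds=None):
    # Sleep for a random interval so requests don't arrive at a
    # predictable rate. With one argument, sleeps between 1 second and
    # that bound; with two, between the given bounds. Hypothetical
    # reconstruction; the real helper may pick its interval differently.
    if max_seconds is None:
        min_seconds, max_seconds = 1, min_seconds
    time.sleep(random.uniform(min_seconds, max_seconds))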
Example 2
    def fetch_problem(self, slug, accepted=False):
        print(f"🤖 Fetching problem: https://leetcode.com/problem/{slug}/...")
        query_params = {
            'operationName': "getQuestionDetail",
            'variables': {'titleSlug': slug},
            'query': '''query getQuestionDetail($titleSlug: String!) {
                question(titleSlug: $titleSlug) {
                    questionId
                    questionFrontendId
                    questionTitle
                    questionTitleSlug
                    content
                    difficulty
                    stats
                    similarQuestions
                    categoryTitle
                    topicTags {
                        name
                        slug
                    }
                }
            }'''
        }

        resp = self.session.post(
            "https://leetcode.com/graphql",
            data=json.dumps(query_params).encode('utf8'),
            headers={
                "content-type": "application/json",
            })
        body = json.loads(resp.content)

        # parse data
        question = get(body, 'data.question')

        Problem.replace(
            id=question['questionId'], display_id=question['questionFrontendId'], title=question["questionTitle"],
            level=question["difficulty"], slug=slug, description=question['content'],
            accepted=accepted
        ).execute()

        for item in question['topicTags']:
            if Tag.get_or_none(Tag.slug == item['slug']) is None:
                Tag.replace(
                    name=item['name'],
                    slug=item['slug']
                ).execute()

            ProblemTag.replace(
                problem=question['questionId'],
                tag=item['slug']
            ).execute()
        random_wait(10, 15)
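Besides random_wait, the LeetCode examples (1, 2 and 4) walk the decoded JSON with a get(obj, 'dotted.path') helper. That matches the behavior of pydash.get; a self-contained stand-in, assuming only dict nesting, is short:

def get(obj, path, default=None):
    # Follow a dot-separated path through nested dicts, e.g.
    # get(body, 'data.question') -> body['data']['question'],
    # returning `default` as soon as a key is missing.
    for key in path.split('.'):
        if not isinstance(obj, dict) or key not in obj:
            return default
        obj = obj[key]
    return obj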
Example 3
def slowly_gather():
    max_range = 10000
    increment = 1
    current_start_page = 1 + len(
        [f for f in os.listdir('scraping') if '.xlsx' in f]) * increment

    while current_start_page < max_range:
        print('Start Page %s' % current_start_page)
        parser = AllRecipesParser(start_page=current_start_page,
                                  search_limit=increment)
        parser.main()
        current_start_page += increment
        random_wait(60)
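Each parser run writes one workbook covering increment search pages, so counting the saved .xlsx files tells slowly_gather where to resume. A quick check of that arithmetic, with a made-up count of three saved files:

increment = 1
saved_workbooks = 3  # e.g. three .xlsx files already in scraping/
next_start = 1 + saved_workbooks * increment
assert next_start == 4  # pages 1-3 are done; resume at page 4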
Example 4
    def fetch_solution(self, slug):
        print(f"🤖 Fetching solution for problem: {slug}")
        query_params = {
            "operationName": "QuestionNote",
            "variables": {"titleSlug": slug},
            "query": '''
            query QuestionNote($titleSlug: String!) {
                question(titleSlug: $titleSlug) {
                    questionId
                    article
                    solution {
                      id
                      content
                      contentTypeId
                      canSeeDetail
                      paidOnly
                      rating {
                        id
                        count
                        average
                        userRating {
                          score
                          __typename
                        }
                        __typename
                      }
                      __typename
                    }
                    __typename
                }
            }
            '''
        }
        resp = self.session.post("https://leetcode.com/graphql",
                                 data=json.dumps(query_params).encode('utf8'),
                                 headers={
                                     "content-type": "application/json",
                                 })
        body = json.loads(resp.content)

        # parse data
        solution = get(body, "data.question")
        # questions without an official solution come back with solution: null
        if solution['solution'] and solution['solution']['paidOnly'] is False:
            Solution.replace(
                problem=solution['questionId'],
                url=f"https://leetcode.com/articles/{slug}/",
                content=solution['solution']['content']
            ).execute()
        random_wait(10, 15)
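All three LeetCode methods repeat the same POST-and-decode sequence against https://leetcode.com/graphql. A hypothetical refactoring (not part of the original class) that would collapse the duplication:

    def graphql(self, query_params):
        # POST a GraphQL payload and decode the JSON response body,
        # exactly as the three fetch_* methods above do inline.
        resp = self.session.post(
            "https://leetcode.com/graphql",
            data=json.dumps(query_params).encode('utf8'),
            headers={"content-type": "application/json"},
        )
        return json.loads(resp.content)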
Example 5
    def get_urls_to_parse(self):
        search_url = self.base_url + '/' + self.base_search_page
        pageno = self.start_page
        search_page = self.call_function(requests.get,
                                         url=search_url + str(pageno),
                                         headers=HEADERS)
        if search_page == 'Error':
            return
        while search_page != 'Error' and search_page.status_code != 404 \
                and self.search_limit > (pageno - self.start_page):
            if search_page.status_code == 503:  # site temporarily off
                # wait out the outage, then re-fetch the same page so the
                # loop sees a fresh response instead of the stale 503
                random_wait(60)
                search_page = self.call_function(requests.get,
                                                 url=search_url + str(pageno),
                                                 headers=HEADERS)
                continue
            soup = BeautifulSoup(search_page.content, features='html.parser')
            self.parse_search_page(soup)
            pageno += 1
            random_wait(15)
            search_page = self.call_function(requests.get,
                                             url=search_url + str(pageno),
                                             headers=HEADERS)
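get_urls_to_parse compares the result of self.call_function against the string 'Error', so the wrapper evidently swallows request failures and returns that sentinel. A minimal sketch under that assumption (the original may also retry or log differently):

    def call_function(self, func, **kwargs):
        # Run func(**kwargs) and degrade to the sentinel 'Error'
        # instead of raising, matching the comparisons above.
        try:
            return func(**kwargs)
        except requests.RequestException as exc:
            print('Request failed: %s' % exc)
            return 'Error'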
Example 6
    def collect_articles(self):
        local_folder = os.path.join('html-downloads',
                                    self.base_url.split('www')[-1].strip('.'))
        if not os.path.exists(local_folder):
            os.makedirs(local_folder)
        for url in self.data:
            local_html = os.path.join(
                local_folder,
                url.replace(self.base_url, '').strip('/').replace('/', '-')
                + '.html')
            if os.path.exists(local_html):
                print("Using local copy %s" % url)
                with open(local_html, 'rb') as html:
                    content = html.read()
            else:
                random_wait(10)  # only need to chill when using live
                print("Using live copy %s" % url)
                resp = self.call_function(requests.get,
                                          url=url,
                                          headers=HEADERS)
                if resp == 'Error':
                    random_wait(20)
                    continue
                if resp.status_code == 404:
                    print("Cannot find %s" % url)
                    continue
                if resp.status_code == 503:
                    random_wait(20)
                    continue
                content = resp.content
                if not os.path.exists(os.path.dirname(local_html)):
                    os.makedirs(os.path.dirname(local_html))
                with open(local_html, 'wb') as html:
                    html.write(content)
            soup = BeautifulSoup(content, features='html.parser')
            try:
                # TODO: implement alt parser
                if soup.contents[2] == self.alt_flag:
                    print('Alt flagged', local_html)
                    continue
                info = self.parse_article_page(soup)
            except Exception as wow:
                print('Serious error with url %s: %s' % (url, wow))
                continue
            self.data[url].update(info)
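Examples 5 and 6 also assume a module-level HEADERS constant. Its value isn't shown; typically it is just a browser-like User-Agent so the site doesn't reject the scraper outright, e.g.:

HEADERS = {
    # placeholder User-Agent; the original value isn't shown
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/96.0 Safari/537.36',
}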
Example 7
def update_to_goodreads(entries, cookies, disk_cache, limit, wait):
    """Update book entries on Goodreads.

    :param entries: list of books
    :param cookies: login cookies for Goodreads
    :param disk_cache: cache of updated books
    :param limit: stop after this many successful updates (None means no limit)
    :param wait: upper bound for the random wait between two updates
    """

    session = requests.Session()

    success = []
    error = []

    for entry in entries:
        isbn13 = entry['isbn13']

        isbns = [isbn13]
        try:
            # prepend the ISBN-10 form when the ISBN-13 is convertible
            isbns.insert(0, pyisbn.convert(isbn13))
        except Exception:
            pass

        resp = check_exists(session, tuple(isbns), cookies)
        if not resp:
            logging.warning('{} couldn\'t be found'.format(repr_book(entry)))
            error.append(entry)
            disk_cache[entry['isbn13']] = 'e'
            random_wait(2)
            continue

        url = get_edit_url(resp)
        if not url:
            logging.warning('{}: edit URL not found'.format(repr_book(entry)))
            error.append(entry)
            disk_cache[entry['isbn13']] = 'e'
            random_wait(2)
            continue

        submit_url, form_data = get_form_data(session, cookies, url)
        if not form_data:
            logging.warning('{}: form data not found'.format(repr_book(entry)))
            error.append(entry)
            disk_cache[entry['isbn13']] = 'e'
            random_wait(2)
            continue

        # Keep the save side-effect free: no feed or blog updates
        form_data['review[cog_explicit]'] = '0'
        for key in ('add_to_blog', 'add_update'):
            if key in form_data:
                form_data[key] = '0'

        # sanity check: the edit form should expose exactly ten
        # readingSessionDatePicker fields
        date_keys = [key for key in form_data
                     if 'readingSessionDatePicker' in key]
        if len(date_keys) != 10:
            logging.warning('{}: date fields are problematic'.format(
                repr_book(entry)))
            logging.warning(form_data)
            error.append(entry)
            disk_cache[entry['isbn13']] = 'e'
            continue

        if update_book(entry, form_data, submit_url, session, cookies):
            success.append(entry)
            disk_cache[entry['isbn13']] = ''
        else:
            error.append(entry)
            disk_cache[entry['isbn13']] = 'e'

        if limit is not None and len(success) >= limit:
            break

        random_wait(wait)

    return success, error
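A hedged usage sketch for this function: entries as a list of dicts keyed by 'isbn13', and a shelve database standing in for disk_cache (any dict-like object works). The ISBN and file name here are made up:

import shelve

entries = [{'isbn13': '9780132350884'}]  # hypothetical minimal input
cookies = {}  # Goodreads login cookies exported from a browser session
with shelve.open('goodreads-cache') as disk_cache:
    updated, failed = update_to_goodreads(entries, cookies, disk_cache,
                                          limit=None, wait=5)
print('%d updated, %d failed' % (len(updated), len(failed)))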