Example #1
    def test_submissions_between_order(self):
        all_subs = list(submissions_between(self.r,
                                            self.sr,
                                            highest_timestamp=time.time(),
                                            verbosity=self.verbosity))

        for i in range(len(all_subs) - 1):
            self.assertGreaterEqual(all_subs[i].created_utc,
                                    all_subs[i + 1].created_utc)

        sr_obj = self.r.get_subreddit(self.sr)
        all_subs_sr_object = list(
            submissions_between(self.r,
                                sr_obj,
                                verbosity=self.verbosity)
        )

        self.assertEqual(all_subs, all_subs_sr_object)

        all_subs_reversed = list(submissions_between(self.r,
                                                     sr_obj,
                                                     newest_first=False,
                                                     verbosity=self.verbosity))

        self.assertEqual(all_subs, list(reversed(all_subs_reversed)))
    def test_submissions_between_raises_correctly(self):
        with self.assertRaises(PRAWException):
            list(
                submissions_between(self.r,
                                    self.sr,
                                    extra_cloudsearch_fields={'self': 'yes'},
                                    verbosity=self.verbosity))
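
For reference, here is a minimal standalone call of the same helper outside the test harness. It is a sketch that assumes PRAW 3.x (where the helper is importable from praw.helpers) and uses an arbitrary subreddit name.

import time

import praw
from praw.helpers import submissions_between

# Sketch only: fetch submissions newest-first up to the current time and
# print the first few; 'learnpython' is an arbitrary example subreddit.
r = praw.Reddit(user_agent='submissions_between usage sketch')
subs = submissions_between(r, 'learnpython', highest_timestamp=time.time())
for i, sub in enumerate(subs):
    print(sub.created_utc, sub.title)
    if i >= 4:
        break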
Example #3
    def test_submissions_between_special_subreddit(self):
        for submission in submissions_between(
                self.r, "all",  # so it doesn't take forever
                highest_timestamp=1121000000,
                verbosity=self.verbosity):
            self.assertLessEqual(submission.created_utc, 1121000000)
            self.assertGreaterEqual(submission.created_utc, 0)
Example #4
def main(*args):
    """ 
    Main loop of scraping iteration.
        
    Arguments: 
    terms is a string of search terms
    subreddit is a string representing subreddit to search in

    Returns: 
    None (write to file instead)
    """

    if len(args) == 1:
        terms = args[0]
        subreddit = 'all'
    elif len(args) == 2:
        terms = args[0]
        subreddit = args[1]
    else:
        print ('Usage: python reddit_scrape.py "<search terms>" '
               '[<subreddit>]')
        return
    
    output_file = "__".join(['_'.join(terms.split()),
                             subreddit]) + '.json'
    
    # initialize Reddit session (see praw.ini for OAuth details)
    r = praw.Reddit(user_agent='Full Scraper by '
                    '/u/<your_username_here> version 1.0',
                    site_name='my_app')

    # gather refresh token from praw.ini
    try:
        r.refresh_access_information()
    except OAuthAppRequired:
        print("Check your OAuth login information (see ./praw.ini).")
        return  # cannot continue scraping without valid OAuth credentials

    # get list of search results with the given terms
    search_results = list(submissions_between(
        r,
        subreddit,
        extra_cloudsearch_fields={'selftext': terms}
        )
    )

    # get comments for each submission above
    # and pull the ones that match the search
    search_results_with_comments = get_comments(search_results)

    # get list of cleaned search results
    cleaned_results = clean_results(search_results_with_comments,
                                    terms)
    
    # write results to file in directory ./output/
    write_results(cleaned_results, terms, output_file)
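
A possible command-line entry point for the module above, matching the usage string that main() prints; the filename reddit_scrape.py is an assumption taken from that message.

import sys

# Hypothetical entry point, e.g.:
#   python reddit_scrape.py "machine learning" python
# which would write matching submissions to machine_learning__python.json
if __name__ == '__main__':
    main(*sys.argv[1:])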
Example #5
def get_past_comments(lowest_timestamp=None, highest_timestamp=None, from_oldest=False):
    """Yield comments on r/soccer submissions made within the given time window."""
    if from_oldest:
        highest_timestamp = find_oldest()
    subs = submissions_between(r, "soccer",
                               highest_timestamp=highest_timestamp,
                               lowest_timestamp=lowest_timestamp)

    for sub in subs:
        for comment in sub.comments:
            # Skip unexpanded MoreComments placeholders; only real comments
            # are wrapped and yielded.
            if not isinstance(comment, MoreComments):
                yield Comment(comment)
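
A small usage sketch for the generator above, assuming the module-level Reddit session r, find_oldest(), MoreComments, and the project's Comment wrapper are defined elsewhere; the timestamps are illustrative only.

# Collect wrapped comments from submissions created between the two epochs.
window_comments = list(get_past_comments(lowest_timestamp=1420070400,
                                         highest_timestamp=1429142400))
print(len(window_comments), "comments collected")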
def get_subreddit_stats(sublist, subreddit_stats):
    # Relies on the module-level r, lowest_timestamp, and highest_timestamp.
    for sub in sublist:
        if str(sub) in subreddit_stats:
            continue
        downvoted_submissions = 0
        total_submissions = 0
        for submission in submissions_between(r,
                                              sub,
                                              lowest_timestamp=lowest_timestamp,
                                              highest_timestamp=highest_timestamp,
                                              verbosity=0):
            assert lowest_timestamp <= submission.created_utc <= highest_timestamp
            if submission.score <= 0:
                downvoted_submissions += 1
            total_submissions += 1
        subreddit_stats[sub] = {"downvoted_submissions": downvoted_submissions,
                                "total_submissions": total_submissions}
        subreddit_stats.sync()
        ratio = downvoted_submissions / float(total_submissions) if total_submissions else 0
        print(sub, total_submissions, downvoted_submissions, ratio)
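
Since subreddit_stats supports .sync(), it is presumably a shelve-backed mapping; the sketch below of opening one and passing it in is an assumption, as are the example subreddit names, and it relies on the same module-level r and timestamp globals noted above.

import shelve

# Assumption: a shelve database lets partial results survive restarts, which
# is why the function calls subreddit_stats.sync() after each subreddit.
subreddit_stats = shelve.open('subreddit_stats.db')
get_subreddit_stats(['python', 'learnpython'], subreddit_stats)
subreddit_stats.close()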
Example #7
def get_flair_stats():
    user = os.environ['REDDIT_USERNAME']
    user_agent = 'Calculating ignored bugs by {}'.format(user)

    r = praw.Reddit(user_agent)
    flair_stats = defaultdict(lambda: defaultdict(lambda: 0))

    for s in submissions_between(r, 'bugs', lowest_timestamp=1400000000):
        created = datetime.utcfromtimestamp(s.created_utc)
        month = (created.year, created.month)
        # They started adding flairs in January 2015
        if month < (2015, 2):
            break
        # Current month has incomplete data
        if month == (date.today().year, date.today().month):
            continue
        # Submissions without flairs seem to be mainly duplicates removed by
        # mods. They are not viewable in the interface, so we don't count them.
        if not s.link_flair_text:
            print("IGNORING POST WITHOUT A FLAIR", s.permalink, s.title)
            continue
        flair_stats[month][s.link_flair_text] += 1

    return flair_stats
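
A brief usage sketch for the function above (it assumes REDDIT_USERNAME is set in the environment): print a chronological per-month breakdown of flair counts.

stats = get_flair_stats()
for year, month in sorted(stats):
    print('{}-{:02d}: {}'.format(year, month, dict(stats[(year, month)])))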
Example #8
    def test_submissions_between_with_filters(self):
        all_subs = list(submissions_between(self.r,
                                            self.sr,
                                            verbosity=self.verbosity))
        t1 = 1420000000
        t2 = 1441111111

        t1_t2_subs = list(submissions_between(self.r,
                                              self.sr,
                                              lowest_timestamp=t1,
                                              highest_timestamp=t2,
                                              verbosity=self.verbosity))

        def filter_subs(subs,
                        lowest_timestamp=0,
                        highest_timestamp=10**10,
                        criterion=None):
            filtered = [s for s in subs
                        if s.created_utc <= highest_timestamp and
                        s.created_utc >= lowest_timestamp and
                        (criterion is None or criterion(s))]
            # make sure we never accidentally craft a bad test case
            self.assertGreater(len(filtered), 0)
            return filtered

        t1_t2_subs_canon = filter_subs(all_subs, t1, t2)
        self.assertEqual(t1_t2_subs, t1_t2_subs_canon)

        self_subs = list(
            submissions_between(self.r,
                                self.sr,
                                extra_cloudsearch_fields={"self": "1"},
                                verbosity=self.verbosity)
        )
        self_subs_canon = filter_subs(all_subs,
                                      criterion=lambda s: s.is_self)
        self.assertEqual(self_subs, self_subs_canon)

        def wa_criterion(s):
            return not s.is_self and \
                urlparse(s.url).netloc == "web.archive.org"

        wa_cs_fields = {"self": "0",
                        "site": "web.archive.org"}

        subs_wa = list(
            submissions_between(self.r,
                                self.sr,
                                extra_cloudsearch_fields=wa_cs_fields,
                                verbosity=self.verbosity)
        )

        subs_wa_canon = filter_subs(all_subs, criterion=wa_criterion)
        self.assertEqual(subs_wa, subs_wa_canon)

        patu_cs_fields = {"self": "1",
                          "author": "PyAPITestUser2",
                          "title": "test"}

        def patu_criterion(s):
            return s.is_self and \
                s.author.name == "PyAPITestUser2" and\
                "test" in s.title.lower()

        subs_patu = list(
            submissions_between(self.r,
                                self.sr,
                                extra_cloudsearch_fields=patu_cs_fields,
                                verbosity=self.verbosity)
        )

        subs_patu_canon = filter_subs(all_subs, criterion=patu_criterion)
        self.assertEqual(subs_patu, subs_patu_canon)
    def test_submissions_between_raises_correctly(self):
        with self.assertRaises(PRAWException):
            list(submissions_between(self.r,
                                     self.sr,
                                     extra_cloudsearch_fields={'self': 'yes'},
                                     verbosity=self.verbosity))
Example #10
def scrape():
    reddit_obj = praw.Reddit(user_agent="Fake Images Scraper")
    psbattle = reddit_obj.get_subreddit(
        "photoshopbattles")  #subreddit of interest
    #submissions = psbattle.get_top_from_year(limit=None)#, params={"after" : "t3_4cd8qr"})
    submissions = submissions_between(
        reddit_obj, psbattle, highest_timestamp=1416475603.0
    )  #leave lowest_timestamp and highest_timestamp blank to get all submissions

    img_count = 0
    count_submissions = 1
    img_count_train = 0
    img_count_test = 0
    link_file_train = open(LINK_FILE_TRAIN, "w")
    link_file_test = open(LINK_FILE_TEST, "w")

    while img_count < NUM_LINKS_TO_GET:  #TODO need this?

        for submission in submissions:  #go through each submission
            try:
                if img_count > NUM_LINKS_TO_GET:
                    break
                print "next submission: ", submission
                if not submission:
                    print "Not a submission"
                else:
                    print "submission id: ", submission.id
                    print "submission timestamp: ", submission.created_utc
                    #decide if images from this submission will be for training or test
                    link_file = None
                    testing = False  #are we adding images to testing or training

                    #add first images to test (avoid divide by 0)
                    #then add to test if we have more than 5x number of training images as testing
                    #NOTE: add all images from a submission to train or test
                    #otherwise classifier might have very similar images in training dataset
                    if img_count_test == 0 or img_count_train / img_count_test >= 5:
                        testing = True
                        link_file = link_file_test
                    else:
                        link_file = link_file_train
                    #get all comments for now (note: may cause dataset imbalance?
                    #also takes longer because more API calls)
                    submission.replace_more_comments(
                        limit=None, threshold=0)  #limit=None for all comments
                    comments = [comment for comment in submission.comments if \
                    not isinstance(comment, praw.objects.MoreComments) and comment.is_root][:NUM_IMGS_PER_COMMENT] #look for at most 10 images a comment

                    count_comments = 1
                    for comment in comments:  #each (root) comment (containing image)
                        #if we've made it this far assume image is original
                        links = find_links(
                            comment.body)  #get links (presumably of images)

                        #this link is valid so download image at this link
                        for link in links:
                            if "](" in link:  #correct mistake made by regex: sometimes get http://imgur.com...](http://imgur.com...)
                                link = link[:link.index(
                                    "]"
                                )]  #get only up to and not including the ]
                            link = link.replace(
                                ")", ""
                            )  #because sometimes Imgur links have trailing )
                            link_file.write(link + " ")
                            link_file.write(submission.id + " ")
                            link_file.write(str(count_comments) + " ")
                            link_file.write(str(img_count + 1) + "\n")
                            img_count += 1
                            if testing:
                                img_count_test += 1
                            else:
                                img_count_train += 1

                        count_comments += 1  #count comment
                    count_submissions += 1  #count this submission as a new one
                    print("%d valid comments on submission %d. Now %d image links total: %d train, %d test" \
                      % (count_comments - 1, count_submissions - 1, img_count, img_count_train, img_count_test))
            except Exception as e:
                print "exception: ", e
        break

    #finish up
    link_file_train.close()
    link_file_test.close()
    print("%d image links scraped in total" % img_count)