Ejemplo n.º 1
0
def extractNewPostsFromIG(todayHT):
    """Scrape today's posts for a hashtag from Instagram into a DataFrame.

    Parameters
    ----------
    todayHT : str
        The hashtag to search (extraction is assumed to run once a day).

    Returns
    -------
    pd.DataFrame
        One row per post with columns: username, date, time, text, photo,
        is_video, points, hashtags.  Sorting is left to the caller.
    """
    # Search hashtag (extraction is once a day, so no time-window handling)
    looter = HashtagLooter(todayHT)

    # Create a df that contains new posts
    appendDF = pd.DataFrame(columns=[
        'username', 'date', 'time', 'text', 'photo', 'is_video', 'points',
        'hashtags'
    ])

    # Make each new post a new row
    for index, onePost in enumerate(looter.medias()):
        onePostDict = looter.get_post_info(onePost.get('shortcode'))

        # Convert the UTC epoch timestamp once and reuse it for both columns.
        taken_at = dt.datetime.utcfromtimestamp(
            int(onePostDict.get('taken_at_timestamp')))

        appendDF.at[index, 'username'] = onePostDict.get('owner').get('username')
        appendDF.at[index, 'date'] = taken_at.strftime("%Y/%m/%d")
        appendDF.at[index, 'time'] = taken_at.strftime("%H:%M:%S")

        # Posts without a caption have an empty 'edges' list; guard against
        # the IndexError the original chained lookup would raise.
        edges = onePostDict.get('edge_media_to_caption').get('edges')
        appendDF.at[index, 'text'] = (
            edges[0].get('node').get('text') if edges else '')

        appendDF.at[index, 'photo'] = onePostDict.get('display_url')
        appendDF.at[index, 'is_video'] = onePostDict.get(
            'is_video')  # returns True or False
        appendDF.at[index, 'hashtags'] = todayHT  # crawled from other database

    # We will do sorting later
    return appendDF
Ejemplo n.º 2
0
    def post(self, hashtag_):
        """Download media links for *hashtag_* and persist them to a file.

        Crawls Instagram for the given hashtag and writes one media URL per
        line to ``hashtag/<hashtag_>.txt``.

        Returns
        -------
        tuple
            ``("ok", 201)`` — response body and HTTP status code.
        """
        # BUG FIX: the original passed the literal string "hashtag_" instead
        # of the parameter, so every request crawled the same bogus tag.
        looter = HashtagLooter(hashtag_)

        with open("hashtag/" + hashtag_ + ".txt", "w") as f:
            for media in looter.medias():
                for link in links(media, looter):
                    f.write("{}\n".format(link))
        return "ok", 201
Ejemplo n.º 3
0
    def test_timeframe_datetime(self):
        """Posts yielded for a datetime timeframe fall within its bounds."""
        looter = HashtagLooter("protein")
        now = datetime.datetime.now()
        # Deliberately pass the bounds in reverse order (later date first)
        # to check the looter copes with either ordering.
        timeframe = (now - datetime.timedelta(5), now - datetime.timedelta(7))

        media = next(looter.medias(timeframe=timeframe))
        taken_at = datetime.datetime.fromtimestamp(media["taken_at_timestamp"])

        # Compare against the later / earlier of the pair explicitly.
        self.assertLessEqual(taken_at, max(timeframe))
        self.assertGreaterEqual(taken_at, min(timeframe))
Ejemplo n.º 4
0
def extractAllPostsFromIG(mainHT, commonHT, allHT):
    """Scrape all posts for *mainHT* and score them by matched hashtags.

    Parameters
    ----------
    mainHT : str
        The primary hashtag to crawl.
    commonHT : str
        Substring shared by the challenge hashtags; more than one occurrence
        in a caption means the post tagged several challenges.
    allHT : iterable of str
        Every known challenge hashtag to look for in the caption.

    Returns
    -------
    pd.DataFrame
        One row per post with columns: username, date, time, text, photo,
        is_video, points, hashtags.  Sorting is left to the caller.
    """
    # Search hashtag
    looter = HashtagLooter(mainHT)

    # Create a df that contains new posts
    appendDF = pd.DataFrame(columns=[
        'username', 'date', 'time', 'text', 'photo', 'is_video', 'points',
        'hashtags'
    ])

    # Make each new post a new row
    for index, onePost in enumerate(looter.medias()):
        onePostDict = looter.get_post_info(onePost.get('shortcode'))

        # Convert the UTC epoch timestamp once and reuse it for both columns.
        taken_at = dt.datetime.utcfromtimestamp(
            int(onePostDict.get('taken_at_timestamp')))

        # Posts without a caption have an empty 'edges' list; guard against
        # the IndexError the original chained lookup would raise, and extract
        # the caption once instead of twice.
        edges = onePostDict.get('edge_media_to_caption').get('edges')
        text = edges[0].get('node').get('text') if edges else ''

        appendDF.at[index, 'username'] = onePostDict.get('owner').get('username')
        appendDF.at[index, 'date'] = taken_at.strftime("%Y/%m/%d")
        appendDF.at[index, 'time'] = taken_at.strftime("%H:%M:%S")
        appendDF.at[index, 'text'] = text
        appendDF.at[index, 'photo'] = onePostDict.get('display_url')
        appendDF.at[index, 'is_video'] = onePostDict.get(
            'is_video')  # returns True or False
        appendDF.at[index, 'points'] = 0  # default; overwritten when matched

        # =====================================================================
        # If the single post contains more than one challenge hashtag
        # =====================================================================
        if text.count(commonHT) > 1:  # many hashtags
            manyHT = [HT for HT in allHT if HT in text]
            totalPoints = sum(
                extractTodayPointsFromGS('DailyChallenges', HT)
                for HT in manyHT)
            appendDF.at[index, 'points'] = totalPoints
            appendDF.at[index, 'hashtags'] = manyHT
        else:  # one hashtag
            # NOTE: no break here on purpose — the original keeps the last
            # matching hashtag when several happen to be substrings.
            for HT in allHT:
                if HT in text:
                    appendDF.at[index, 'hashtags'] = HT
                    appendDF.at[index, 'points'] = extractTodayPointsFromGS(
                        'DailyChallenges', HT)

    # We will do sorting later
    return appendDF
Ejemplo n.º 5
0
    def test_issue_076(self):
        """Thanks to @zeshuaro for reporting this bug.

        Check that when downloading hashtags, the downloader
        actually stops.
        """
        looter = HashtagLooter("oulianov", session=self.session)

        medias_it = looter.medias()
        expected_total = length_hint(medias_it)

        # Consuming more items than the advertised length means the
        # iterator never terminated on its own.
        for seen, _media in enumerate(medias_it):
            if seen > expected_total:
                self.fail("looter.medias() did not stop.")
Ejemplo n.º 6
0
def get_hashtag(hashtag):
    """Fetch up to 100 recent medias for *hashtag* and serialise each one."""
    print("get hashtag : %s" % hashtag)
    looter = HashtagLooter(hashtag)
    first_hundred = islice(looter.medias(), 100)
    serialised = [media_to_dict(media, hashtag) for media in first_hundred]
    return {"res": serialised}
Ejemplo n.º 7
0
 def test_timeframe_date(self):
     looter = HashtagLooter("protein")
     timeframe = datetime.date(2019, 12, 27), datetime.date(2019, 12, 20)
     media = next(looter.medias(timeframe=timeframe))