class InstagramGeoFinder:
    """Collect media posted at an Instagram location and export it as HTML.

    The instance keeps an API client in ``self.api``. ``find_geo`` stores the
    location name in ``self.file_name``; ``get_filename`` and
    ``save_to_file`` read that attribute, so ``find_geo`` must be called
    before either of them.
    """

    # Static page skeleton; the generated table rows go between the two parts.
    _HTML_HEAD = """
                <!DOCTYPE html>
                <html>
                 <head>
                  <meta charset="utf-8">
                  <title>Kiski Finder</title>
                 </head>
                 <body>
                <table border=1>
                     <tr>
                      <th>Kiska</th>
                      <th>FullName</th>
                       <th>Time and Description</th>


                       <th>User</th>
                       <th>Photo</th>
                     </tr>
                     <indent>

                       """

    _HTML_TAIL = """
                     </indent>
                </table>
                 </body>
                </html>
                """

    def __init__(self):
        """Create the API client, reusing cached settings when they exist.

        Without a settings file an unauthenticated client is created.
        With one, the cached settings (including the device id) are reused;
        if the client reports an expired cookie or a required login, a
        fresh login is performed with the cached device id and
        ``onlogin_callback`` is registered as the ``on_login`` hook.
        """
        # NOTE(review): the credentials below are placeholder literals; the
        # original intent appears to be reading them from the environment
        # ("instagram_username" / "instagram_password") -- confirm.
        self.settings_file = "instagram_client"

        device_id = None
        try:
            if not os.path.isfile(self.settings_file):
                self.api = Client(auto_patch=True, authenticate=False)
            else:
                with open(self.settings_file) as file_data:
                    cached_settings = json.load(file_data,
                                                object_hook=from_json)
                print('Reusing settings: {0!s}'.format(self.settings_file))

                device_id = cached_settings.get('device_id')
                # Reuse the cached auth settings.
                self.api = Client(username="******",
                                  password="******",
                                  settings=cached_settings)

        except (ClientCookieExpiredError, ClientLoginRequiredError):
            # Login expired: log in again with default UA/keys, but keep the
            # device id that was cached (None when nothing was loaded).
            self.api = Client(
                username="******",
                password="******",
                device_id=device_id,
                on_login=lambda x: onlogin_callback(x, self.settings_file))

    def find_geo(self, location_id, count):
        """Fetch the "recent" sections of a location, page by page.

        :param location_id: id passed to ``location_info`` and
            ``location_section`` of the API client
        :param count: stop requesting further pages once at least this many
            sections have been collected (the last page is kept whole, so
            the result may be longer than ``count``)
        :return: list of the collected section dicts
        """
        rank_token = Client.generate_uuid()
        location_info = self.api.location_info(location_id)
        self.file_name = location_info["location"]["name"]

        result_photo = []
        next_max_id = None
        while True:
            # The first request carries no max_id; later ones continue from
            # the cursor returned by the previous page.
            paging = {"max_id": next_max_id} if next_max_id else {}
            page = self.api.location_section(location_id,
                                             rank_token,
                                             tab='recent',
                                             **paging)
            result_photo.extend(page["sections"])

            if len(result_photo) >= count:
                break
            # A missing or empty cursor means there are no more pages.
            next_max_id = page.get("next_max_id")
            if not next_max_id:
                break

        logging.info(len(result_photo))
        return result_photo

    def get_filename(self):
        """Return the HTML file name derived from ``self.file_name``."""
        return "instagram_%s.html" % self.file_name

    @staticmethod
    def _render_row(media):
        """Render one media dict as a complete ``<tr>...</tr>`` table row."""
        # Imported locally because the file's import block is not visible
        # from this class; every interpolated value is escaped since it
        # comes from user-controlled profile/caption data.
        from html import escape

        try:
            caption = media["caption"]
            created_at = int(caption["created_at"])
            stamp = datetime.datetime.fromtimestamp(
                created_at,
                datetime.timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
            photo_time = stamp + " </br> " + escape(str(caption["text"]))
        except (TypeError, KeyError, ValueError, OverflowError, OSError):
            # No caption (None), incomplete caption or unusable timestamp.
            photo_time = "-"

        try:
            image_url = media["image_versions2"]["candidates"][0]["url"]
        except KeyError:
            # Without top-level image versions, fall back to the first
            # entry of "carousel_media".
            image_url = media["carousel_media"][0]["image_versions2"][
                "candidates"][0]["url"]

        user = media["user"]
        username = escape(str(user["username"]))
        cells = [
            '<td><a href="http://instagram.com/{0}/">{0}</a></td>'.format(
                username),
            "<td>{0}</td>".format(escape(str(user["full_name"]))),
            '<td width="200" style="word-break: break-all;">{0}</td>'.format(
                photo_time),
            '<td> <img src="{0}" width="255" height="255" alt="lorem"> </td>'
            .format(escape(str(user["profile_pic_url"]))),
            '<td><img src="{0}" width="255" height="255" alt="lorem"></td>'
            .format(escape(str(image_url))),
        ]
        return "<tr>" + "".join(cells) + "</tr>"

    def save_to_file(self, array_photo):
        """Write the collected sections to an HTML table on disk.

        :param array_photo: list of section dicts as returned by
            ``find_geo``; each must provide ``["layout_content"]["medias"]``
        :return: relative path of the written file
            (``"result_html/" + self.get_filename()``)
        """
        logging.info("Save to html")

        # One closed <tr> per media item.
        rows = [
            self._render_row(item["media"])
            for layout in array_photo
            for item in layout["layout_content"]["medias"]
        ]

        os.makedirs("result_html", exist_ok=True)

        result_path = "result_html/" + self.get_filename()
        with open(result_path, 'w', encoding="utf-8") as file:
            file.write(self._HTML_HEAD + "".join(rows) + self._HTML_TAIL)
        return result_path
# Example #2
# 0
class Scraper:
    """Class for extracting data from Instagram"""

    # HERE Places endpoint queried by categorize() to classify a location.
    HERE_AUTOSUGGEST_URL = \
        'https://places.cit.api.here.com/places/v1/autosuggest'
    # Seconds to wait for the HERE service before giving up on a lookup.
    HERE_TIMEOUT_SECONDS = 10

    def __init__(self):
        """Create an unauthenticated scraper and set up its logger.

        Attributes initialised here:

        * ``api``: API client, ``None`` until :meth:`login` is called
        * ``app_id`` / ``app_code``: HERE credentials, ``None`` until
          :meth:`setHereApp` is called
        * ``stories_found``: ids of the stories already processed
        * ``users_found``: list of users found
        * ``locations_categories``: dict that maps a location id to its
          category
        """
        self.api = None
        self.app_id = None
        self.app_code = None
        self.stories_found = []
        self.users_found = []
        self.locations_categories = {}
        self.setLogging()

    def login(self, username: str, password: str) -> None:
        """Logs to Instagram

        :param username: Instagram login name
        :param password: Instagram login password
        """
        self.api = Client(username, password)

    def setHereApp(self, app_id: str, app_code: str) -> None:
        """Sets Here app id and code

        :param app_id: here app id
        :param app_code: here app code
        """
        self.app_id = app_id
        self.app_code = app_code

    def findStories(self, source_id: int, filename: str) -> None:
        """Searches stories made in a location and saves them in a file

        Appends one ``userid|timestamp|location_id|location_name`` row per
        story location. Stories whose id is already in ``stories_found``
        are skipped. Errors are logged, never raised (best effort).

        :param source_id: instagram source location id
        :param filename: file where to save stories
        """
        try:
            results = self.api.location_stories(source_id)
            items = results['story']['items']
        except Exception:
            self.logger.exception(
                'Could not fetch stories for location %s', source_id)
            return

        new_rows = []
        pending_ids = []
        for item in items:
            try:
                item_id = item['id']
                if item_id in self.stories_found or item_id in pending_ids:
                    continue
                userid = item['user']['pk']
                # Timestamp recorded for the story: "expiring_at" minus
                # 86400 seconds (one day).
                timestamp = item['expiring_at'] - 86400
                item_rows = [[
                    userid, timestamp, story_location['location']['pk'],
                    story_location['location']['name']
                ] for story_location in item['story_locations']]
            except (KeyError, TypeError):
                # One malformed story must not discard the rest of the batch.
                self.logger.exception('Skipping malformed story item')
                continue
            pending_ids.append(item_id)
            new_rows.extend(item_rows)

        if not pending_ids:
            return

        if new_rows:
            try:
                # newline='' is what the csv module requires of its files.
                with open(filename, 'a', newline='',
                          encoding='utf-8') as file:
                    writer = csv.writer(file,
                                        delimiter='|',
                                        quoting=csv.QUOTE_MINIMAL)
                    writer.writerows(new_rows)
            except (OSError, csv.Error):
                # Leave the stories unmarked so a later call retries them.
                self.logger.exception('Could not write stories to %s',
                                      filename)
                return
        self.stories_found.extend(pending_ids)

    def _lookupCategory(self, location_id: str, location_name: str) -> str:
        """Return the category HERE suggests for a location ('' if none)."""
        info = self.api.location_info(location_id)
        lat = info['location']['lat']
        lng = info['location']['lng']

        # Passing params lets requests URL-encode the location name.
        response = requests.get(self.HERE_AUTOSUGGEST_URL,
                                params={
                                    'at': '{0},{1}'.format(lat, lng),
                                    'q': location_name,
                                    'app_id': self.app_id,
                                    'app_code': self.app_code,
                                },
                                timeout=self.HERE_TIMEOUT_SECONDS)
        response.raise_for_status()
        results = response.json()["results"]
        return results[0]['category'] if results else ''

    def categorize(self, source_file: str, target_file: str) -> None:
        """Searches locations' categories

        Reads ``userid|timestamp|location_id|location_name`` rows and
        appends each row plus its category to the target file. A category
        is looked up once per location id and then served from
        ``locations_categories``. Rows that fail are logged and skipped.

        :param source_file: file with locations to find categories
        :param target_file: file where to write locations along their categories
        """
        with open(source_file, 'r', newline='', encoding='utf-8') as inp, \
                open(target_file, 'a', newline='', encoding='utf-8') as out:
            reader = csv.reader(inp, delimiter='|')
            writer = csv.writer(out, delimiter='|')
            for row in reader:
                if len(row) < 4:
                    # Blank lines are ignored; truncated rows are reported.
                    if row:
                        self.logger.error('Skipping incomplete row: %r', row)
                    continue
                userid, timestamp, location_id, location_name = row[:4]
                try:
                    if location_id not in self.locations_categories:
                        self.locations_categories[location_id] = \
                            self._lookupCategory(location_id, location_name)
                    # Written for every row, cached or not, so no record of
                    # an already-known location is dropped.
                    writer.writerow([
                        userid, timestamp, location_id, location_name,
                        self.locations_categories[location_id]
                    ])
                except Exception:
                    self.logger.exception(
                        'Could not categorize location %s', location_id)
                    continue

    def setLogging(self) -> None:
        """Sets logger handler and formatters

        The logger is shared by all instances (it is looked up by module
        name), so the file handler is attached only once; otherwise every
        new instance would duplicate each log line.
        """
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.INFO)
        already_configured = any(
            isinstance(handler, handlers.RotatingFileHandler)
            for handler in self.logger.handlers)
        if already_configured:
            return

        errorLogHandler = handlers.RotatingFileHandler('error.log',
                                                       maxBytes=5000,
                                                       backupCount=0)
        errorLogHandler.setLevel(logging.ERROR)
        formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        errorLogHandler.setFormatter(formatter)
        self.logger.addHandler(errorLogHandler)