def write_timemap(self, turl=None, tm_content=None):
        """
        Write TimeMap content to a new, timestamp-named file.

        The file is created in
        <timemap_dir>/<handle, lower-cased>/<domain>/<wrep + lang>/ and is
        named after the current time, converted with
        Utils.epochtime_to_memento, followed by the TimeMap extension.

        Parameters:
            turl (str): Twitter URL
            tm_content (str): TimeMap Content
        Returns:
            (bool): True on Success and False on Failure
        """
        tresponse = Utils.get_turl_info(turl)
        try:
            tmdir = os.path.join(self.__timemap_dir,
                                 tresponse["handle"].lower(),
                                 tresponse["domain"],
                                 tresponse["wrep"] + tresponse["lang"])
            # A single makedirs call replaces the exists()/mkdir() chain: it
            # does not race with concurrent writers and also creates missing
            # parents. It lives inside the try so that a directory error is
            # reported as False, as the docstring promises.
            os.makedirs(tmdir, exist_ok=True)
            # time.time() is expressed in seconds, not milliseconds.
            now_seconds = int(round(time.time()))
            tmpath = os.path.join(tmdir, str(Utils.epochtime_to_memento(now_seconds)) + self.__constants.TM_EXT)
            with open(tmpath, "w") as tm_ofile:
                tm_ofile.write(tm_content)
            return True
        except Exception as e:
            sys.stderr.write("TimeMap Write Error: " + str(e) + "\n")
        return False
 def parse_mementos(self, turl):
     """
     Parse the stored mementos listed in the TimeMap of a Twitter URL.

     The TimeMap is restricted to the range returned by
     Utils.get_timerange. Mementos from archive.is / archive.md are
     skipped; every other memento whose content can be read is handed to
     __parse_memento.

     Parameters:
         turl (str): Twitter URL

     Returns:
         (list): the follower list that __parse_memento appends to
     """
     timerange = Utils.get_timerange(self.__constants, self.__conf_reader)
     lurims = Utils.parse_timemap(self.__dmanager, self.__constants, turl,
                                  self.__conf_reader, timerange["mintime"],
                                  timerange["maxtime"])
     for urim in lurims or []:
         minfo = Utils.get_murl_info(urim, self.__thandle)
         if self.__conf_reader.debug:
             sys.stdout.write("parse_mementos: " + str(minfo) + "\n")
         # If archive.is mementos then skip it, as we do not parse them
         if minfo["archive"] in ["archive.is", "archive.md"]:
             continue
         mcontent = self.__dmanager.read_memento(urim)
         readable = mcontent is not None
         if self.__conf_reader.debug:
             outcome = "   True" if readable else "   " + str(mcontent)
             sys.stdout.write(
                 "parse_mementos: read_memento:  " + str(urim) + outcome +
                 "\n")
         if readable:
             self.__parse_memento(mcontent, urim)
     return self.__lfollower
    def read_timemap(self, turl=None):
        """
        This function is for reading TimeMap.

        Every file found in
        <timemap_dir>/<handle, lower-cased>/<domain>/<wrep + lang>/ is read
        line by line. Lines starting with "@" or "!" are ignored; each
        remaining line is kept once, in first-seen order and exactly as read
        (line ending included).

        Parameters:
            turl (str): Twitter URL

        Returns:
            (list): Content on Success and None on Failure
        """
        if self.lookup_timemap(turl):
            try:
                tresponse = Utils.get_turl_info(turl)
                tmpath = os.path.join(self.__timemap_dir,
                                      tresponse["handle"].lower(),
                                      tresponse["domain"],
                                      tresponse["wrep"] + tresponse["lang"])
                urims = []
                # The set makes the duplicate check O(1) per line; the list
                # alone preserves the order in which lines were first seen.
                seen = set()
                for time_map in os.listdir(tmpath):
                    with open(os.path.join(tmpath, time_map), "r") as tm_ofile:
                        for line in tm_ofile:
                            if line.startswith(("@", "!")) or line in seen:
                                continue
                            seen.add(line)
                            urims.append(line)
                return urims
            except Exception as e:
                sys.stderr.write("TimeMap Read Error: " + str(e) + "\n")
        return None
    def read_memento(self, murl=None):
        """
        This function is for reading memento content.

        The stored file is located with lookup_memento. For a WARC file the
        payload of the first 'response' record is returned, unless its
        Content-Length is below the size threshold for its capture period
        (see the comment in the code). For an ".html" file the whole text is
        returned.

        Parameters:
            murl (str):URI-M

        Returns:
            (str): Content on Success and None on Failure
        """
        mpath = self.lookup_memento(murl)
        response = Utils.get_murl_info(murl, self.__thandle)
        if not mpath:
            return None
        if self.__constants.WARC_EXT in mpath:
            try:
                with open(mpath, 'rb') as stream:
                    for record in ArchiveIterator(stream):
                        if record.rec_type != 'response':
                            continue
                        # Fetch the header once instead of once per use.
                        content_length = record.rec_headers.get_header('Content-Length')
                        if self.__config.debug:
                            # The docstring declares murl a str, yet this line
                            # used to index it like a dict, which raised in
                            # debug mode; accept both forms.
                            uri = murl["uri"] if isinstance(murl, dict) else murl
                            sys.stdout.write(str(uri) + " Content Size: " + str(content_length) + "\n")
                        timestamp = int(response["timestamp"])
                        size = int(content_length)
                        # Reject small payloads: under 1000 bytes for captures
                        # before 2009, under 100000 bytes for captures after
                        # 2020 -- presumably error or placeholder pages; the
                        # thresholds are not explained elsewhere, confirm.
                        if (timestamp < 20090101000000 and size < 1000) or \
                                (timestamp > 20200101000000 and size < 100000):
                            return None
                        return record.content_stream().read()
            except Exception as e:
                sys.stderr.write("Memento Read Error: " + str(e) + "\n")
        elif ".html" in mpath:
            try:
                with open(mpath, "r") as stream:
                    return stream.read()
            except Exception as e:
                sys.stderr.write("Memento Read Error: " + str(e) + "\n")
        return None
 def __parse_case1(self, soup):
     """
     Extract the follower count from the "ProfileNav" page layout.

     For each "li.ProfileNav-item.ProfileNav-item--followers" element the
     count is taken, in order of preference, from the "data-count"
     attribute of the value span, from the digits of the stat link's
     "title", or from the digits of the value span's text. Only in the
     last case a trailing k/m/b abbreviation scales the result. When
     several elements match, the last one wins.

     Parameters:
         soup (bs4.BeautifulSoup): parsed memento

     Returns:
         Follower count as produced by Utils.convert_digits_to_english
         (scaled where abbreviated), or None when no follower element
         is present
     """
     if self.__conf_reader.debug:
         sys.stdout.write("__parse_case1" + "\n")
     # Pre-bind both names so an empty selection cannot leave them undefined.
     fcount = None
     fcount_temp = None
     follower_tags = soup.select(
         "li.ProfileNav-item.ProfileNav-item--followers")
     for tags in follower_tags:
         if self.__conf_reader.debug:
             sys.stdout.write(str(tags) + "\n")
         fcount_temp = None
         value_tag = tags.select("span.ProfileNav-value")[0]
         if value_tag.has_attr("data-count"):
             fcount = value_tag["data-count"]
         else:
             link_tags = tags.select(
                 "a.ProfileNav-stat.ProfileNav-stat--link.u-borderUserColor"
             )
             if self.__conf_reader.debug:
                 sys.stdout.write(str(link_tags) + "\n")
             if link_tags:
                 fcount = re.sub(r"\D", '', link_tags[0]["title"])
             else:
                 fcount_temp = value_tag.text
                 # NOTE(review): stripping non-digits also drops a decimal
                 # separator, so an abbreviated value such as "1.2K" would
                 # presumably scale from 12 rather than 1.2 -- confirm
                 # against real pages.
                 fcount = re.sub(r"\D", '', fcount_temp)
     if fcount is None:
         return None
     if self.__conf_reader.debug:
         sys.stdout.write("Follower Count: {}".format(fcount) + "\n")
     tcount = Utils.convert_digits_to_english(fcount)
     if fcount_temp is not None:
         # endswith() is used rather than fcount_temp[-1]: the Kannada
         # abbreviation is two code points long and could never equal a
         # single character, and endswith() is safe on empty text.
         if fcount_temp.endswith(("k", "K", "ಸಾ")):
             tcount = tcount * 1000
         elif fcount_temp.endswith(("m", "M")):
             tcount = tcount * 1000000
         elif fcount_temp.endswith(("b", "B")):
             tcount = tcount * 1000000000
     return tcount
    def write_memento(self, murl=None):
        """
        This is function to write memento in WARC format.

        Nothing is downloaded when lookup_memento already finds the memento.
        Otherwise the URI-M is fetched and stored as a gzip-compressed WARC
        'response' record at
        <memento_dir>/<handle, lower-cased>/<domain>/<archive>/<wrep + lang>/
        <timestamp><WARC extension>.

        Parameters:
            murl (str): URI-M

        Returns:
            (bool): True on Success and False on Failure
        """
        try:
            if self.lookup_memento(murl):
                return True
            response = Utils.get_murl_info(murl, self.__thandle)
            mdir = os.path.join(self.__memento_dir,
                                response["handle"].lower(),
                                response["domain"],
                                response["archive"],
                                response["wrep"] + response["lang"])
            # One race-free call instead of five exists()/mkdir() pairs.
            os.makedirs(mdir, exist_ok=True)
            mpath = os.path.join(mdir, str(response["timestamp"]) + self.__constants.WARC_EXT)
            try:
                # Fetch before opening the output file so that a failed
                # request no longer leaves an empty WARC file behind.
                resp = requests.get(murl,
                                    headers={'Accept-Encoding': 'identity'},
                                    stream=True, timeout=120)
                try:
                    # get raw headers from urllib3
                    headers_list = resp.raw.headers.items()
                    # NOTE(review): the recorded status line is always
                    # '200 OK', whatever the server answered -- confirm
                    # this is intended.
                    http_headers = StatusAndHeaders('200 OK', headers_list, protocol='HTTP/1.1')
                    with open(mpath, "wb") as output:
                        writer = WARCWriter(output, gzip=True)
                        # NOTE(review): the record URI is the local file
                        # path, not murl; kept as-is so stored records do
                        # not change -- confirm this is intended.
                        record = writer.create_warc_record(mpath, 'response',
                                                           payload=resp.raw,
                                                           http_headers=http_headers)
                        writer.write_record(record)
                finally:
                    # A streamed response holds its connection until closed.
                    resp.close()
                return True
            except requests.exceptions.TooManyRedirects:
                sys.stderr.write(murl + "Too Many redirects" + "\n")
            except requests.exceptions.ConnectTimeout:
                sys.stderr.write(murl + "Connection Timeout" + "\n")
            except Exception as e:
                sys.stderr.write("Memento Write Error: " + str(e) + "URL:" + murl + "\n")
        except Exception as e:
            # str() keeps the handler itself from failing when murl is None.
            sys.stderr.write("Memento Write Error: " + str(murl) + " " + str(e) + "\n")
        return False
    def __parse_memento(self, mcontent, urim):
        """
        Extract the follower count from one memento and record it.

        The memento is parsed with BeautifulSoup and matched against a list
        of layout selectors; the first selector that matches decides which
        __parse_caseN method runs. A truthy count is appended to the
        follower list together with the memento timestamp and URI-M.
        Mementos that match no selector, or that fail while being parsed,
        are logged to "NonParsedMementos.txt" in the current directory.

        Parameters:
            mcontent: memento content handed to BeautifulSoup
            urim (dict): memento entry; its "uri" key holds the URI-M
        """
        murl = urim["uri"]
        soup = bs4.BeautifulSoup(mcontent, "html.parser")
        try:
            # Selector i identifies the page layout handled by parser i + 1.
            lselector = [
                "li.ProfileNav-item.ProfileNav-item--followers",
                "ul.user-stats.clearfix", "table.stats.js-mini-profile-stats",
                "ul.stats.js-mini-profile-stats", "div.stats", "div#section",
                "table.stats", "div#side"
            ]
            matched = [selector for selector in lselector
                       if soup.select(selector)]
            if not matched:
                with open(os.path.join(os.getcwd(), "NonParsedMementos.txt"), "a+") as ofile:
                    ofile.write("No selector found: " + murl + "\n")
            else:
                first_match = matched[0]
                if self.__conf_reader.debug:
                    sys.stdout.write(
                        str(urim) + "  " + str(first_match) + "\n")
                parsers = dict(zip(lselector, (
                    self.__parse_case1,
                    self.__parse_case2,
                    self.__parse_case3,
                    self.__parse_case4,
                    self.__parse_case5,
                    self.__parse_case6,
                    self.__parse_case7,
                    self.__parse_case8,
                )))
                tcount = parsers[first_match](soup)
                if tcount:
                    if self.__conf_reader.debug:
                        sys.stdout.write(
                            "URIM: {} Converted: {}".format(murl, tcount) +
                            "\n")
                    minfo = Utils.get_murl_info(urim,
                                                self.__thandle.lower())
                    entry = {
                        "MementoDatetime": minfo["timestamp"],
                        "URIM": murl,
                        "FollowerCount": tcount,
                    }
                    self.__lfollower.append(entry)
        except Exception as e:
            sys.stderr.write(
                "parse_memento: URL: {}: Error: {}".format(murl, e) + "\n")
            with open(os.path.join(os.getcwd(), "NonParsedMementos.txt"), "a+") as ofile:
                ofile.write("Error: " + murl + "\n")
Esempio n. 8
0
def test_parse_timemap(datamager_connection):
    """Utils.parse_timemap yields the expected first and last memento entries."""
    manager, _configreader, consts = datamager_connection(parameters=False)
    first_memento = {
        'datetime': 'Mon, 28 May 2018 23:54:45 GMT',
        'rel': 'first memento',
        'uri': 'https://web.archive.org/web/20180528235445/https://twitter.com/m_nsiddique',
    }
    last_memento = {
        'datetime': 'Tue, 29 Oct 2019 18:25:06 GMT',
        'rel': 'last memento',
        'uri': 'https://web.archive.org/web/20191029182506/https://twitter.com/m_nsiddique',
    }
    parsed = Utils.parse_timemap(manager, consts,
                                 "https://twitter.com/m_nsiddique")
    assert parsed == [first_memento, last_memento]
    def lookup_timemap(self, turl=None):
        """
        This function looks up for TimeMap.

        A TimeMap counts as present when the directory
        <timemap_dir>/<handle, lower-cased>/<domain>/<wrep + lang>/ exists
        and contains at least one entry.

        Parameters:
            turl (str): Twitter URL

        Returns:
            (bool): True on Success and False on Failure
        """
        try:
            tresponse = Utils.get_turl_info(turl)
            tmpath = os.path.join(self.__timemap_dir,
                                  tresponse["handle"].lower(),
                                  tresponse["domain"],
                                  tresponse["wrep"] + tresponse["lang"])
            return os.path.exists(tmpath) and len(os.listdir(tmpath)) > 0
        except Exception as e:
            # Was self.stderr, which does not exist: the handler itself
            # raised AttributeError instead of logging the failure.
            sys.stderr.write("LookUp TimeMap: " + str(turl) + "  " + str(e) + "\n")
        # Explicit False (previously an implicit None) as documented.
        return False
Esempio n. 10
0
    def lookup_memento(self, murl=None):
        """
        This function looks up for mementos.

        A WARC file is accepted only when it is non-empty; an ".html" file
        only has to exist.

        Parameters:
            murl (str): URI-M

        Returns:
            (str): Path of Memento on Success and None on Failure
        """
        try:
            response = Utils.get_murl_info(murl, self.__thandle)
            handle = response["handle"].lower()
            timestamp = str(response["timestamp"])
            warc_name = timestamp + self.__constants.WARC_EXT
            warc_paths = [
                # Layout used by write_memento: "wrep" and "lang" form ONE
                # directory name. The lookup used to treat them as two
                # nested directories, so mementos with a non-empty "wrep"
                # were never found.
                os.path.join(self.__memento_dir, handle, response["domain"], response["archive"],
                             response["wrep"] + response["lang"], warc_name),
                # Previously checked layout, kept as a fallback so that any
                # file stored that way is still found.
                os.path.join(self.__memento_dir, handle, response["domain"], response["archive"],
                             response["wrep"], response["lang"], warc_name),
            ]
            for mpath in warc_paths:
                if os.path.exists(mpath) and os.stat(mpath).st_size > 0:
                    return mpath
            # NOTE(review): the ".html" layout has no "domain" level and
            # splits "wrep"/"lang"; no writer for it is visible here, so it
            # is left unchanged -- confirm against the code that stores it.
            mpath = os.path.join(self.__memento_dir, handle, response["archive"],
                                 response["wrep"], response["lang"], timestamp + ".html")
            if os.path.exists(mpath):
                return mpath
        except Exception as e:
            sys.stderr.write("Memento Lookup Error: " + str(murl) + "  " + str(e) + "\n")
        return None
Esempio n. 11
0
def test_get_turl_info():
    """Utils.get_turl_info splits a Twitter URL into domain, handle, lang and wrep."""
    # (Twitter URL, expected 'lang', expected 'wrep')
    cases = (
        ("https://twitter.com/m_nsiddique", 'default', ''),
        ("https://twitter.com/m_nsiddique?lang=en", 'en', ''),
        ("https://twitter.com/m_nsiddique/with_replies", 'default',
         'with_replies_'),
        ("https://twitter.com/m_nsiddique/with_replies?lang=en", 'en',
         'with_replies_'),
        ("https://twitter.com/m_nsiddique/status/1", 'default', ''),
        ("http://twitter.com/m_nsiddique", 'default', ''),
    )
    for turl, lang, wrep in cases:
        expected = {
            'domain': 'desktop',
            'handle': 'm_nsiddique',
            'lang': lang,
            'wrep': wrep,
        }
        assert Utils.get_turl_info(turl) == expected
Esempio n. 12
0
def test_epochtime_to_memento():
    """Epoch seconds are rendered as 14-digit memento datetimes."""
    for epoch, memento in ((1606565532, "20201128121212"),
                           (0, "19700101000000")):
        assert Utils.epochtime_to_memento(epoch) == memento
Esempio n. 13
0
def test_malformed_memento_to_epochtime():
    """Malformed memento datetimes are converted to None."""
    # Identity check: "== None" can be satisfied by a custom __eq__.
    assert Utils.memento_to_epochtime("20201128") is None
    assert Utils.memento_to_epochtime("abc") is None
Esempio n. 14
0
def test_get_timerange(datamager_connection):
    """Without parameters the range runs from 20060321120000 up to the current time."""
    dmanager, configreader, constants = datamager_connection(parameters=False)
    # Bracket the call with two clock readings. Comparing "maxtime" for
    # equality with a now() taken after the call fails whenever the clock
    # moves to the next second in between.
    before = int(datetime.datetime.now().strftime("%Y%m%d%H%M%S"))
    timerange = Utils.get_timerange(constants, configreader)
    after = int(datetime.datetime.now().strftime("%Y%m%d%H%M%S"))
    assert set(timerange) == {"mintime", "maxtime"}
    assert timerange["mintime"] == 20060321120000
    assert before <= timerange["maxtime"] <= after
Esempio n. 15
0
def test_memento_to_epochtime():
    """14-digit memento datetimes are converted to epoch seconds."""
    for memento, epoch in (("20201128121212", 1606565532),
                           ("19700101000000", 0)):
        assert Utils.memento_to_epochtime(memento) == epoch
Esempio n. 16
0
def test_get_murl_info():
    """Utils.get_murl_info splits a URI-M into archive, timestamp and Twitter URL parts."""
    archive_prefix = "http://web.archive.org/web/20191029182506/"
    # (Twitter URL inside the URI-M, expected 'lang', expected 'wrep')
    cases = (
        ("https://twitter.com/m_nsiddique", 'default', ''),
        ("https://twitter.com/m_nsiddique?lang=en", 'en', ''),
        ("https://twitter.com/m_nsiddique/with_replies", 'default',
         'with_replies_'),
        ("https://twitter.com/m_nsiddique/with_replies?lang=en", 'en',
         'with_replies_'),
        ("https://twitter.com/m_nsiddique/status/1", 'default', ''),
        ("http://twitter.com/m_nsiddique", 'default', ''),
    )
    for turl, lang, wrep in cases:
        # Built in one place so that no key can be listed twice, as
        # 'domain' and 'handle' were in the last hand-written literal.
        expected = {
            'archive': 'web.archive.org',
            'timestamp': '20191029182506',
            'TwitterURL': turl,
            'domain': 'desktop',
            'handle': 'm_nsiddique',
            'lang': lang,
            'wrep': wrep,
        }
        assert Utils.get_murl_info(archive_prefix + turl) == expected
    def __parse_timemap(self):
        """
        Build the download frontier from the TimeMap of the Twitter URL.

        Every memento inside the range returned by Utils.get_timerange is
        counted. Mementos from archive.is / archive.md and mementos whose
        timestamp is not all digits are ignored. Mementos that the data
        manager does not have yet are grouped by archive.

        Returns:
            (list): one dict per archive, in first-seen order, with the keys
                "archive", "count" and "urims" (URI-Ms still to download)
        """
        todo_frontier = []
        # Frontier entries by archive, so that grouping does not need a
        # scan of todo_frontier for every memento.
        frontier_by_archive = {}
        # List to count mementos
        # Index 0: Total Urls
        # Index 1: Already downloaded mementos
        # Index 2: To be downloaded mementos
        mcount = [0, 0, 0]
        timerange = Utils.get_timerange(self.__constants, self.__conf_reader)
        if self.__conf_reader.debug:
            sys.stdout.write(
                "__parse_timemap:  Minimum Live Timestamp: {} Maximum Live Timestamp: {}"
                .format(timerange["mintime"], timerange["maxtime"]) + "\n")
        timemap_content = Utils.parse_timemap(self.__dmanager,
                                              self.__constants, self.__turl,
                                              self.__conf_reader,
                                              timerange["mintime"],
                                              timerange["maxtime"])
        if self.__conf_reader.debug:
            sys.stdout.write("__parse_timemap: " + str(timemap_content) + "\n")
        # parse_mementos guards the same call against a falsy result, so an
        # empty or missing TimeMap is treated here as "nothing to do".
        for memento in timemap_content or []:
            response = Utils.get_murl_info(memento, self.__thandle)
            # If archive.is mementos then skip it, as we do not parse them
            if response["archive"] in ["archive.is", "archive.md"]:
                continue
            if not response["timestamp"].isdigit():
                continue
            if not (timerange["mintime"] <= int(response["timestamp"])
                    <= timerange["maxtime"]):
                continue
            # Count total number of mementos for twitter handle within the time range
            mcount[0] += 1
            if self.__dmanager.lookup_memento(memento):
                mcount[1] += 1
                continue
            mcount[2] += 1
            entry = frontier_by_archive.get(response["archive"])
            if entry is None:
                entry = {
                    "archive": response["archive"],
                    "count": 0,
                    "urims": []
                }
                frontier_by_archive[response["archive"]] = entry
                todo_frontier.append(entry)
            entry["urims"].append(memento["uri"])
            entry["count"] += 1
        # Write logs for each user
        if self.__conf_reader.debug:
            sys.stdout.write("fetch_mementos: Twitter Handle: " +
                             self.__thandle + "\n")
            sys.stdout.write(
                "fetch_mementos: Date-Time: " +
                str(time.strftime("%b %d %Y %H:%M:%S", time.gmtime())) + "\n")
            sys.stdout.write("fetch_mementos: Total Memento URLs: " +
                             str(mcount[0]) + "\n")
            sys.stdout.write(
                "fetch_mementos: Number of Mementos already downloaded: " +
                str(mcount[1]) + "\n")
            sys.stdout.write(
                "fetch_mementos: Number of Mementos for consideration: " +
                str(mcount[2]) + "\n")
        return todo_frontier
Esempio n. 18
0
def test_malformed_epochtime_to_memento():
    """A negative epoch is still converted; a non-numeric epoch yields None."""
    assert Utils.epochtime_to_memento(-1) == "19691231235959"
    # Identity check: "== None" can be satisfied by a custom __eq__.
    assert Utils.epochtime_to_memento("123") is None