def write_timemap(self, turl=None, tm_content=None):
    """
    This is function to write TimeMap.

    Parameters:
        turl (str): Twitter URL
        tm_content (str): TimeMap Content

    Returns:
        (bool): True on Success and False on Failure
    """
    tresponse = Utils.get_turl_info(turl)
    # Build <timemap_dir>/<handle>/<domain>/<wrep><lang> in one call.
    # makedirs(exist_ok=True) replaces the old exists()/mkdir() chain,
    # which was race-prone (TOCTOU) and needlessly repetitive.
    tmpath = os.path.join(self.__timemap_dir,
                          tresponse["handle"].lower(),
                          tresponse["domain"],
                          tresponse["wrep"] + tresponse["lang"])
    os.makedirs(tmpath, exist_ok=True)
    # File name is the current time rendered as a 14-digit memento stamp.
    millis = int(round(time.time()))
    try:
        tmpath = os.path.join(
            tmpath,
            str(Utils.epochtime_to_memento(millis)) + self.__constants.TM_EXT)
        with open(tmpath, "w") as tm_ofile:
            tm_ofile.write(tm_content)
        return True
    except Exception as e:
        sys.stderr.write("TimeMap Write Error: " + str(e) + "\n")
        return False
def parse_mementos(self, turl):
    """Parse every readable memento of *turl*; return the follower list."""
    trange = Utils.get_timerange(self.__constants, self.__conf_reader)
    urims = Utils.parse_timemap(self.__dmanager, self.__constants, turl,
                                self.__conf_reader, trange["mintime"],
                                trange["maxtime"])
    debug = self.__conf_reader.debug
    for urim in urims or []:
        minfo = Utils.get_murl_info(urim, self.__thandle)
        if debug:
            sys.stdout.write("parse_mementos: " + str(minfo) + "\n")
        # If archive.is mementos then skip it, as we do not parse them
        if minfo["archive"] in ["archive.is", "archive.md"]:
            continue
        mcontent = self.__dmanager.read_memento(urim)
        if mcontent is None:
            if debug:
                sys.stdout.write("parse_mementos: read_memento: " +
                                 str(urim) + " " + str(mcontent) + "\n")
            continue
        if debug:
            sys.stdout.write("parse_mementos: read_memento: " + str(urim) +
                             " True" + "\n")
        self.__parse_memento(mcontent, urim)
    return self.__lfollower
def read_timemap(self, turl=None):
    """
    This function is for reading TimeMap.

    Parameters:
        turl (str): Twitter URL

    Returns:
        (list): Content on Success and None on Failure
    """
    if self.lookup_timemap(turl):
        try:
            tresponse = Utils.get_turl_info(turl)
            tmpath = os.path.join(self.__timemap_dir,
                                  tresponse["handle"].lower(),
                                  tresponse["domain"],
                                  tresponse["wrep"] + tresponse["lang"])
            urims = []
            # Set gives O(1) duplicate checks; the old `line not in urims`
            # list scan was O(n) per line (O(n^2) overall). Order and
            # content (lines keep their trailing newline) are unchanged.
            seen = set()
            for time_map in os.listdir(tmpath):
                with open(os.path.join(tmpath, time_map), "r") as tm_ofile:
                    for line in tm_ofile:
                        # Lines starting with "@" or "!" are TimeMap
                        # metadata, not URI-Ms.
                        if line.startswith(("@", "!")):
                            continue
                        if line not in seen:
                            seen.add(line)
                            urims.append(line)
            return urims
        except Exception as e:
            sys.stderr.write("TimeMap Read Error: " + str(e) + "\n")
    return None
def read_memento(self, murl=None):
    """
    This function is for reading memento content.

    Parameters:
        murl (str):URI-M

    Returns:
        (str): Content on Success and None on Failure
    """
    # NOTE(review): murl is indexed as murl["uri"] below, so callers appear
    # to pass a dict-like memento entry rather than a plain string — the
    # docstring type looks stale; confirm against parse_mementos().
    mpath = self.lookup_memento(murl)
    response = Utils.get_murl_info(murl, self.__thandle)
    if mpath:
        if self.__constants.WARC_EXT in mpath:
            # WARC on disk: scan records for the HTTP response payload.
            try:
                with open(mpath, 'rb') as stream:
                    for record in ArchiveIterator(stream):
                        if record.rec_type == 'response':
                            if self.__config.debug:
                                sys.stdout.write(str(murl["uri"]) + " Content Size: " + str(record.rec_headers.get_header('Content-Length')) + "\n")
                            # Heuristic filter: discard suspiciously small
                            # captures — pre-2009 mementos under 1000 bytes
                            # and post-2020 mementos under 100000 bytes are
                            # treated as empty/error pages.
                            if (int(response["timestamp"]) < 20090101000000 and int(record.rec_headers.get_header('Content-Length')) < 1000) or (int(response["timestamp"]) > 20200101000000 and int(record.rec_headers.get_header('Content-Length')) < 100000):
                                return None
                            else:
                                return record.content_stream().read()
            except Exception as e:
                sys.stderr.write("Memento Read Error: " + str(e) + "\n")
        elif ".html" in mpath:
            # Legacy plain-HTML memento: return the raw file text.
            try:
                with open(mpath, "r") as stream:
                    return stream.read()
            except Exception as e:
                sys.stderr.write("Memento Read Error: " + str(e) + "\n")
    return None
def __parse_case1(self, soup):
    """
    Extract the follower count from a "ProfileNav"-style Twitter page.

    Parameters:
        soup (bs4.BeautifulSoup): Parsed memento markup

    Returns:
        (int): Follower count, scaled when the page shows an
        abbreviated value such as "1.2K" / "3M" / "1B".
    """
    if self.__conf_reader.debug:
        sys.stdout.write("__parse_case1" + "\n")
    follower_tags = soup.select(
        "li.ProfileNav-item.ProfileNav-item--followers")
    for tags in follower_tags:
        if self.__conf_reader.debug:
            sys.stdout.write(str(tags) + "\n")
        fcount_temp = None
        value_span = tags.select("span.ProfileNav-value")[0]
        if value_span.has_attr("data-count"):
            # Exact count embedded in the data-count attribute.
            # BUG FIX: the old code fell through to fcount[0]["title"],
            # which indexes a plain string and raised TypeError.
            fcount = re.sub(r"\D", '', value_span["data-count"])
        else:
            anchors = tags.select(
                "a.ProfileNav-stat.ProfileNav-stat--link.u-borderUserColor"
            )
            if self.__conf_reader.debug:
                sys.stdout.write(str(anchors) + "\n")
            if anchors:
                # Full count lives in the anchor's title attribute.
                fcount = re.sub(r"\D", '', anchors[0]["title"])
            else:
                # Fall back to the visible, possibly abbreviated, text.
                fcount_temp = value_span.text
                fcount = re.sub(r"\D", '', fcount_temp)
        if self.__conf_reader.debug:
            sys.stdout.write("Follower Count: {}".format(fcount) + "\n")
        tcount = Utils.convert_digits_to_english(fcount)
        if fcount_temp is not None:
            # Scale abbreviated suffixes (incl. Kannada thousand "ಸಾ").
            if fcount_temp[-1] in ["k", "K", "ಸಾ"]:
                tcount = tcount * 1000
            elif fcount_temp[-1] in ["m", "M"]:
                tcount = tcount * 1000000
            # BUG FIX: compare the suffix character, not the whole string
            # (was `fcount_temp in ["b", "B"]`, which never matched a
            # real abbreviated count like "1B").
            elif fcount_temp[-1] in ["b", "B"]:
                tcount = tcount * 1000000000
        return tcount
def write_memento(self, murl=None):
    """
    This is function to write memento in WARC format.

    Parameters:
        murl (str): URI-M

    Returns:
        (bool): True on Success and False on Failure
    """
    try:
        if self.lookup_memento(murl):
            # Already on disk; nothing to download.
            return True
        response = Utils.get_murl_info(murl, self.__thandle)
        # Build <memento_dir>/<handle>/<domain>/<archive>/<wrep><lang>.
        # makedirs(exist_ok=True) replaces the old exists()/mkdir() chain,
        # which was race-prone (TOCTOU) and needlessly repetitive.
        mpath = os.path.join(self.__memento_dir,
                             response["handle"].lower(),
                             response["domain"],
                             response["archive"],
                             response["wrep"] + response["lang"])
        os.makedirs(mpath, exist_ok=True)
        try:
            mpath = os.path.join(
                mpath, str(response["timestamp"]) + self.__constants.WARC_EXT)
            with open(mpath, "wb") as output:
                writer = WARCWriter(output, gzip=True)
                # identity encoding + stream=True so resp.raw can be fed
                # to the WARC writer untouched.
                resp = requests.get(murl,
                                    headers={'Accept-Encoding': 'identity'},
                                    stream=True,
                                    timeout=120)
                # get raw headers from urllib3
                headers_list = resp.raw.headers.items()
                http_headers = StatusAndHeaders('200 OK', headers_list,
                                                protocol='HTTP/1.1')
                # NOTE(review): the record URI passed here is the local
                # file path (mpath), not the fetched URL — confirm this
                # is intended for downstream WARC consumers.
                record = writer.create_warc_record(mpath, 'response',
                                                   payload=resp.raw,
                                                   http_headers=http_headers)
                writer.write_record(record)
            return True
        except requests.exceptions.TooManyRedirects:
            sys.stderr.write(murl + "Too Many redirects" + "\n")
        except requests.exceptions.ConnectTimeout:
            sys.stderr.write(murl + "Connection Timeout" + "\n")
        except Exception as e:
            sys.stderr.write("Memento Write Error: " + str(e) + "URL:" +
                             murl + "\n")
    except Exception as e:
        sys.stderr.write("Memento Write Error: " + murl + " " + str(e) + "\n")
    return False
def __parse_memento(self, mcontent, urim):
    """Parse one memento's HTML and record its follower count."""
    murl = urim["uri"]
    soup = bs4.BeautifulSoup(mcontent, "html.parser")
    try:
        # Selectors in priority order; the first one that matches
        # anything decides which case-parser runs (same index in both
        # lists below).
        lselector = [
            "li.ProfileNav-item.ProfileNav-item--followers",
            "ul.user-stats.clearfix",
            "table.stats.js-mini-profile-stats",
            "ul.stats.js-mini-profile-stats",
            "div.stats",
            "div#section",
            "table.stats",
            "div#side"
        ]
        lhandlers = [
            self.__parse_case1, self.__parse_case2, self.__parse_case3,
            self.__parse_case4, self.__parse_case5, self.__parse_case6,
            self.__parse_case7, self.__parse_case8
        ]
        matched = [sel for sel in lselector if soup.select(sel)]
        if matched:
            if self.__conf_reader.debug:
                sys.stdout.write(str(urim) + " " + str(matched[0]) + "\n")
            tcount = lhandlers[lselector.index(matched[0])](soup)
            if tcount:
                if self.__conf_reader.debug:
                    sys.stdout.write(
                        "URIM: {} Converted: {}".format(murl, tcount) + "\n")
                response = Utils.get_murl_info(urim, self.__thandle.lower())
                self.__lfollower.append({
                    "MementoDatetime": response["timestamp"],
                    "URIM": murl,
                    "FollowerCount": tcount
                })
        else:
            with open(os.path.join(os.getcwd(), "NonParsedMementos.txt"),
                      "a+") as ofile:
                ofile.write("No selector found: " + murl + "\n")
    except Exception as e:
        sys.stderr.write(
            "parse_memento: URL: {}: Error: {}".format(murl, e) + "\n")
        with open(os.path.join(os.getcwd(), "NonParsedMementos.txt"),
                  "a+") as ofile:
            ofile.write("Error: " + murl + "\n")
def test_parse_timemap(datamager_connection):
    """parse_timemap returns the first and last mementos for the handle."""
    dmanager, configreader, constants = datamager_connection(parameters=False)
    actual = Utils.parse_timemap(dmanager, constants,
                                 "https://twitter.com/m_nsiddique")
    expected = [{
        'datetime': 'Mon, 28 May 2018 23:54:45 GMT',
        'rel': 'first memento',
        'uri': 'https://web.archive.org/web/20180528235445/https://twitter.com/m_nsiddique'
    }, {
        'datetime': 'Tue, 29 Oct 2019 18:25:06 GMT',
        'rel': 'last memento',
        'uri': 'https://web.archive.org/web/20191029182506/https://twitter.com/m_nsiddique'
    }]
    assert actual == expected
def lookup_timemap(self, turl=None):
    """
    This function looks up for TimeMap.

    Parameters:
        turl (str): Twitter URL

    Returns:
        (bool): True on Success and False on Failure
    """
    try:
        tresponse = Utils.get_turl_info(turl)
        tmpath = os.path.join(self.__timemap_dir,
                              tresponse["handle"].lower(),
                              tresponse["domain"],
                              tresponse["wrep"] + tresponse["lang"])
        # A TimeMap exists only if its directory is present and non-empty.
        if os.path.exists(tmpath) and len(os.listdir(tmpath)) > 0:
            return True
        return False
    except Exception as e:
        # BUG FIX: was `self.stderr.write(...)`, which raised
        # AttributeError and masked the original error; every other
        # method in this file logs via sys.stderr.
        sys.stderr.write("LookUp TimeMap: " + str(turl) + " " + str(e) +
                         "\n")
        return False
def lookup_memento(self, murl=None):
    """
    This function looks up for mementos.

    Parameters:
        murl (str): URI-M

    Returns:
        (str): Path of Memento on Success and None on Failure
    """
    try:
        response = Utils.get_murl_info(murl, self.__thandle)
        # Preferred layout: WARC under
        # <handle>/<domain>/<archive>/<wrep>/<lang>/<timestamp>.warc
        mpath = os.path.join(self.__memento_dir,
                             response["handle"].lower(),
                             response["domain"], response["archive"],
                             response["wrep"], response["lang"],
                             response["timestamp"] + self.__constants.WARC_EXT)
        # Only a non-empty WARC file counts as a hit.
        if os.path.exists(mpath) and os.stat(mpath).st_size > 0:
            return mpath
        else:
            # Fallback: legacy plain-HTML layout.
            # NOTE(review): this path omits the "domain" component used
            # above — confirm the asymmetry is intentional.
            mpath = os.path.join(self.__memento_dir,
                                 response["handle"].lower(),
                                 response["archive"], response["wrep"],
                                 response["lang"],
                                 response["timestamp"] + ".html")
            if os.path.exists(mpath):
                return mpath
        return None
    except Exception as e:
        sys.stderr.write("Memento Lookup Error: " + str(murl) + " " +
                         str(e) + "\n")
def test_get_turl_info():
    """get_turl_info decomposes Twitter URLs into handle/domain/lang/wrep."""
    base = {'domain': 'desktop', 'handle': 'm_nsiddique'}
    cases = [
        ("https://twitter.com/m_nsiddique",
         dict(base, lang='default', wrep='')),
        ("https://twitter.com/m_nsiddique?lang=en",
         dict(base, lang='en', wrep='')),
        ("https://twitter.com/m_nsiddique/with_replies",
         dict(base, lang='default', wrep='with_replies_')),
        ("https://twitter.com/m_nsiddique/with_replies?lang=en",
         dict(base, lang='en', wrep='with_replies_')),
        ("https://twitter.com/m_nsiddique/status/1",
         dict(base, lang='default', wrep='')),
        ("http://twitter.com/m_nsiddique",
         dict(base, lang='default', wrep='')),
    ]
    for turl, expected in cases:
        assert Utils.get_turl_info(turl) == expected
def test_epochtime_to_memento():
    """Epoch seconds convert to a 14-digit memento timestamp."""
    for epoch, stamp in [(1606565532, "20201128121212"),
                         (0, "19700101000000")]:
        assert Utils.epochtime_to_memento(epoch) == stamp
def test_malformed_memento_to_epochtime():
    """Malformed memento timestamps yield None rather than raising."""
    # FIX: use the identity check `is None` (PEP 8) instead of `== None`,
    # which can be fooled by objects overriding __eq__.
    assert Utils.memento_to_epochtime("20201128") is None
    assert Utils.memento_to_epochtime("abc") is None
def test_get_timerange(datamager_connection):
    """get_timerange spans Twitter's launch date up to "now"."""
    dmanager, configreader, constants = datamager_connection(parameters=False)
    actual = Utils.get_timerange(constants, configreader)
    # NOTE: maxtime is re-derived from the wall clock here, so this can
    # flake if the call and the comparison straddle a second boundary.
    expected = {
        "mintime": 20060321120000,
        "maxtime": int(datetime.datetime.now().strftime("%Y%m%d%H%M%S"))
    }
    assert actual == expected
def test_memento_to_epochtime():
    """Valid 14-digit memento timestamps convert to epoch seconds."""
    for stamp, epoch in [("20201128121212", 1606565532),
                         ("19700101000000", 0)]:
        assert Utils.memento_to_epochtime(stamp) == epoch
def test_get_murl_info():
    """get_murl_info decomposes memento URLs into archive/timestamp/parts."""
    assert Utils.get_murl_info(
        "http://web.archive.org/web/20191029182506/https://twitter.com/m_nsiddique"
    ) == {
        'domain': 'desktop',
        'handle': 'm_nsiddique',
        'TwitterURL': 'https://twitter.com/m_nsiddique',
        'lang': 'default',
        'wrep': '',
        'archive': 'web.archive.org',
        'timestamp': '20191029182506'
    }
    assert Utils.get_murl_info(
        "http://web.archive.org/web/20191029182506/https://twitter.com/m_nsiddique?lang=en"
    ) == {
        'archive': 'web.archive.org',
        'timestamp': '20191029182506',
        'TwitterURL': 'https://twitter.com/m_nsiddique?lang=en',
        'domain': 'desktop',
        'handle': 'm_nsiddique',
        'lang': 'en',
        'wrep': ''
    }
    assert Utils.get_murl_info(
        "http://web.archive.org/web/20191029182506/https://twitter.com/m_nsiddique/with_replies"
    ) == {
        'archive': 'web.archive.org',
        'timestamp': '20191029182506',
        'TwitterURL': 'https://twitter.com/m_nsiddique/with_replies',
        'domain': 'desktop',
        'handle': 'm_nsiddique',
        'lang': 'default',
        'wrep': 'with_replies_'
    }
    assert Utils.get_murl_info(
        "http://web.archive.org/web/20191029182506/https://twitter.com/m_nsiddique/with_replies?lang=en"
    ) == {
        'archive': 'web.archive.org',
        'timestamp': '20191029182506',
        'TwitterURL': 'https://twitter.com/m_nsiddique/with_replies?lang=en',
        'domain': 'desktop',
        'handle': 'm_nsiddique',
        'lang': 'en',
        'wrep': 'with_replies_'
    }
    assert Utils.get_murl_info(
        "http://web.archive.org/web/20191029182506/https://twitter.com/m_nsiddique/status/1"
    ) == {
        'archive': 'web.archive.org',
        'timestamp': '20191029182506',
        'TwitterURL': 'https://twitter.com/m_nsiddique/status/1',
        'domain': 'desktop',
        'handle': 'm_nsiddique',
        'lang': 'default',
        'wrep': ''
    }
    # FIX: the final expected dict repeated the 'domain' and 'handle' keys;
    # duplicate keys in a dict literal silently keep only the last value.
    assert Utils.get_murl_info(
        "http://web.archive.org/web/20191029182506/http://twitter.com/m_nsiddique"
    ) == {
        'archive': 'web.archive.org',
        'timestamp': '20191029182506',
        'TwitterURL': 'http://twitter.com/m_nsiddique',
        'domain': 'desktop',
        'handle': 'm_nsiddique',
        'lang': 'default',
        'wrep': ''
    }
def __parse_timemap(self):
    """
    Build the download frontier from this handle's TimeMap.

    Returns:
        (list): One {"archive", "count", "urims"} dict per archive,
        listing the URI-Ms that still need to be downloaded.
    """
    todo_frontier = []
    '''
    List to count mementos
    Index 0: Total Urls
    Index 1: Already downloaded mementos
    Index 2: To be downloaded mementos
    '''
    mcount = [0, 0, 0]
    timerange = Utils.get_timerange(self.__constants, self.__conf_reader)
    if self.__conf_reader.debug:
        sys.stdout.write(
            "__parse_timemap: Minimum Live Timestamp: {} Maximum Live Timestamp: {}"
            .format(timerange["mintime"], timerange["maxtime"]) + "\n")
    timemap_content = Utils.parse_timemap(self.__dmanager, self.__constants,
                                          self.__turl, self.__conf_reader,
                                          timerange["mintime"],
                                          timerange["maxtime"])
    if self.__conf_reader.debug:
        sys.stdout.write("__parse_timemap: " + str(timemap_content) + "\n")
    for memento in timemap_content:
        response = Utils.get_murl_info(memento, self.__thandle)
        # If archive.is mementos then skip it, as we do not parse them
        # Added to remove wayback.archive.it
        if response["archive"] not in ["archive.is", "archive.md"]:
            if response["timestamp"].isdigit():
                if timerange["mintime"] <= int(
                        response["timestamp"]) <= timerange["maxtime"]:
                    # Count total number of mementos for twitter handle
                    # within the time range
                    mcount[0] += 1
                    memento_present = self.__dmanager.lookup_memento(
                        memento)
                    if memento_present:
                        mcount[1] += 1
                    else:
                        mcount[2] += 1
                        # Group to-be-downloaded URI-Ms by their archive;
                        # append to an existing entry when one exists.
                        frontier_present = False
                        for entry in todo_frontier:
                            if entry["archive"] == response["archive"]:
                                frontier_present = True
                                entry["urims"].append(memento["uri"])
                                entry["count"] += 1
                                break
                        if not frontier_present:
                            json_object = {
                                "archive": response["archive"],
                                "count": 1,
                                "urims": [memento["uri"]]
                            }
                            todo_frontier.append(json_object)
    # Write logs for each user
    if self.__conf_reader.debug:
        sys.stdout.write("fetch_mementos: Twitter Handle: " +
                         self.__thandle + "\n")
    if self.__conf_reader.debug:
        sys.stdout.write(
            "fetch_mementos: Date-Time: " +
            str(time.strftime("%b %d %Y %H:%M:%S", time.gmtime())) + "\n")
    if self.__conf_reader.debug:
        sys.stdout.write("fetch_mementos: Total Memento URLs: " +
                         str(mcount[0]) + "\n")
    if self.__conf_reader.debug:
        sys.stdout.write(
            "fetch_mementos: Number of Mementos already downloaded: " +
            str(mcount[1]) + "\n")
    if self.__conf_reader.debug:
        sys.stdout.write(
            "fetch_mementos: Number of Mementos for consideration: " +
            str(mcount[2]) + "\n")
    return todo_frontier
def test_malformed_epochtime_to_memento():
    """Edge/bad inputs to epochtime_to_memento."""
    # -1 is a valid pre-epoch timestamp and still converts.
    assert Utils.epochtime_to_memento(-1) == "19691231235959"
    # FIX: use the identity check `is None` (PEP 8) instead of `== None`.
    assert Utils.epochtime_to_memento("123") is None