def read_list_urls(list_urls_file):
    categories_medias_urls = defaultdict(lambda: defaultdict(set))
    with open(list_urls_file, "r") as f:
        for row in csv.DictReader(f):
            url = normalize_url(row["clean_url"].strip())
            categories_medias_urls[row["niv0"]][row["webentity"]].add(url)
    return categories_medias_urls
def test_basics(self):
    for url, normalized in TESTS:
        assert normalize_url(url) == normalized, url

    assert normalize_url('lemonde.fr/index/', strip_trailing_slash=True) == 'lemonde.fr'

    assert normalize_url('https://[email protected]', strip_authentication=False) == '*****@*****.**'

    assert normalize_url(
        'https://www.lemonde.fr',
        strip_protocol=False,
        strip_irrelevant_subdomain=False) == 'https://www.lemonde.fr'

    assert normalize_url(
        'www.lemonde.fr',
        strip_protocol=False,
        strip_irrelevant_subdomain=False) == 'www.lemonde.fr'
def filter_and_enrich_tweets_from_csv(f, cat_urls, of=sys.stdout, total=None):
    categories = list(cat_urls.keys())
    casa = casanova.enricher(f, of, add=["matched_urls", "webentities"] + categories)
    links_pos = casa.pos.links
    try:
        for row in tqdm(casa, total=total):
            links = [normalize_url(u) for u in row[links_pos].split('|')]
            if not links:
                continue
            matched_urls = []
            webentities = set()
            cat_belongings = []
            for cat in categories:
                cat_match = False
                for we, urls in cat_urls[cat].items():
                    # Iterate over a copy since matched links are removed along the way
                    for u in list(links):
                        if u in urls:
                            cat_match = True
                            matched_urls.append(u)
                            webentities.add(we)
                            links.remove(u)
                cat_belongings.append(cat_match)
            if webentities:
                casa.writerow(row, ["|".join(matched_urls), "|".join(webentities)] + cat_belongings)
    except Exception:
        print("ERROR while processing", row, file=sys.stderr)
        raise
def associate_urls_medias(csviterator, csv_field, trie):
    print("url,media")
    for row in csviterator:
        link = row.get(csv_field, None)
        if not link:
            continue
        link = normalize_url(link, strip_irrelevant_subdomain=False, strip_protocol=False)
        media = trie.longest(link) or ""
        print('"%s","%s"' % (link.replace('"', '""'), media.replace('"', '""')))
def extract_media_urls(db, trie):
    print("url,media")
    done = set()
    for tweet in db.find({"langs": "fr"}, projection=["links", "proper_links"]):
        for link in tweet.get("proper_links", tweet["links"]):
            link = normalize_url(link, strip_irrelevant_subdomains=False, strip_protocol=False)
            if link in done:
                continue
            done.add(link)
            media = trie.longest(link)
            if media:
                print('"%s","%s"' % (link.replace('"', '""'), media.replace('"', '""')))
def extract_standard_addendum(cli_args, url):
    inferred_redirection = infer_redirection(url)

    return [
        normalize_url(
            url,
            strip_protocol=cli_args.strip_protocol,
            strip_trailing_slash=True
        ),
        inferred_redirection if inferred_redirection != url else '',
        get_domain_name(url),
        get_hostname(url),
        get_normalized_hostname(url),
        'yes' if is_shortened_url(url) else ''
    ]
def read_urls_types(list_urls_file):
    urls_types = {}
    codes = set()
    categories = set()
    with open(list_urls_file, "r") as f:
        reader = csv.DictReader(f)
        for row in reader:
            codes.add(row["code"])
            categories.add(row["block"])
            url = normalize_url(row["clean_url"].strip(), strip_trailing_slash=True)
            urls_types[url] = [row["code"], row["block"], row["webentity"]]
    return urls_types, list(codes), list(categories)
def clean_data(url_df, fact_check_df, SCIENTIFIC_TOPIC):
    """Clean and merge the appearance data"""

    # Remove the spaces added by error around the URLs
    url_df['url'] = url_df['url'].transform(lambda x: x.strip())

    # Filter the URLs to keep only the ones flagged as False or equivalent:
    url_df = url_df[(url_df['Flag as'].isin(
        ['False', 'Partly false', 'Misleading', 'False headline']))]

    # Use a REGEX to get the article field from the fact-check url website:
    # if the fact-check url starts with 'https://climatefeedback.org' -> 'climate' article
    # if the fact-check url starts with 'https://healthfeedback.org' -> 'health' article
    fact_check_df['field'] = fact_check_df['Review url'].str.extract(
        'https://([^/]+)feedback.org')

    # Merge the two dataframes to get the 'field' for each url:
    url_df = url_df.dropna(subset=['Item reviewed'])
    fact_check_df = fact_check_df.dropna(subset=['Items reviewed'])
    url_df = url_df.merge(fact_check_df[['Items reviewed', 'field', 'topic']],
                          left_on='Item reviewed', right_on='Items reviewed', how='left')

    # Keep only the URLs about the scientific topic of interest:
    url_df.loc[url_df['topic'] == 'COVID-19', 'field'] = 'COVID-19'
    url_df = url_df.dropna(subset=['field'])
    url_df = url_df[url_df['field'] == SCIENTIFIC_TOPIC]

    # Clean the URLs and extract their domain name:
    url_df['url'] = url_df['url'].apply(lambda x: ural.normalize_url(
        x, strip_protocol=False, strip_trailing_slash=True))
    url_df['domain_name'] = url_df['url'].apply(
        lambda x: ural.get_domain_name(x))

    # Remove duplicated URLs from the dataframe,
    # keeping only the first, i.e. the most recent occurrence.
    url_df = url_df.drop_duplicates(subset="url", keep="first")

    # # Remove the platforms from the analysis:
    # platforms = ["facebook.com", "youtube.com", "twitter.com", "wordpress.com", "instagram.com"]
    # url_df = url_df[~url_df['domain_name'].isin(platforms)]

    # # Remove the urls with parameters from the analysis because CT returns wrong results for them:
    # url_df['parameter_in_url'] = url_df['url'].apply(lambda x: '?' in x)
    # url_df = url_df[url_df['parameter_in_url']==False]

    url_df = url_df[['url', 'Item reviewed', 'field', 'domain_name']]

    return url_df
def extract_users_urls_medias_from_csv(f, trie, of=sys.stdout, total=None, filter_fr=False, min_date=None):
    headers = [
        'tweet_id', 'user_screen_name', 'user_id', 'normalized_url',
        'domain_name', 'webentity', 'datetime', 'is_retweet', 'nb_followers'
    ]
    writer = csv.writer(of)
    writer.writerow(headers)
    casa = casanova.reader(f)
    try:
        for row, (tid, uname, uid, dtime, rtid, nbfols, links, lang) in tqdm(enumerate(
                casa.cells([
                    'id', 'from_user_name', 'from_user_id', 'created_at',
                    'retweeted_id', 'from_user_followercount', 'links', 'lang'
                ])), total=total):
            if filter_fr and lang != 'fr':
                continue
            if min_date and dtime < min_date:
                continue
            is_rt = (rtid != '')
            for url in links.split('|'):
                url = url.strip()
                if not url:
                    continue
                webentity = trie.match(url)
                normalized = normalize_url(url)
                domain = normalized.split("/")[0]
                if not webentity:
                    # if "twitter.com/" not in url and "youtube.com" not in url:
                    #     print('WARNING: url unmatched on row #%s: %s' % (row, domain), file=sys.stderr)
                    continue
                writer.writerow([
                    tid, uname, uid, normalized, domain, webentity, dtime, is_rt, nbfols
                ])
    except Exception:
        print(
            'ERROR while processing row #%s (https://twitter.com/%s/statuses/%s)'
            % (row, uname, tid),
            file=sys.stderr)
        raise
def clean_url_format(url_df):
    url_df['url'] = url_df['url'].transform(lambda x: x.strip())

    url_df['url_cleaned'] = url_df['url']\
        .apply(lambda x: ural.normalize_url(x, strip_protocol=False, strip_trailing_slash=True))

    url_df['domain_name'] = url_df['url_cleaned'].apply(
        lambda x: ural.get_domain_name(x))

    # Remove duplicated URLs from the dataframe,
    # keeping only the first, i.e. the most recent occurrence.
    url_df = url_df.drop_duplicates(subset="url", keep="first")

    return url_df
def normalize_action(namespace):
    sort_query = not namespace.no_query_sort
    strip_authentication = not namespace.keep_authentication
    strip_trailing_slash = namespace.strip_trailing_slash
    strip_index = not namespace.keep_index

    headers, position, reader = custom_reader(namespace.file, namespace.column)

    headers.append(namespace.column + "_normalized")
    writer = csv.writer(namespace.output)
    writer.writerow(headers)

    for line in reader:
        url = line[position]
        line.append(normalize_url(url,
                                  sort_query=sort_query,
                                  strip_authentication=strip_authentication,
                                  strip_trailing_slash=strip_trailing_slash,
                                  strip_index=strip_index))
        writer.writerow(line)
def filter_and_enrich_tweets_from_csv(f, cat_urls, codes, categories, of=sys.stdout, total=None):
    add_fields = ["matched_urls", "webentities"] + categories + codes
    casa = casanova.enricher(f, of, add=add_fields)
    links_pos = casa.pos.links
    len_row = len(casa.fieldnames) - casa.added_count
    add_pos = {field: i for i, field in enumerate(add_fields)}
    try:
        for row in tqdm(casa, total=total):
            links = [
                normalize_url(u.strip(), strip_trailing_slash=True)
                for u in row[links_pos].split('|')
            ]
            if not links:
                continue
            webentities = set()
            matched_urls = set()
            add_row = ['', ''] + [False for i in categories] + [False for j in codes]
            for u in links:
                infos = cat_urls.get(u, None)
                if infos:
                    matched_urls.add(u)
                    add_row[add_pos[infos[0]]] = True
                    add_row[add_pos[infos[1]]] = True
                    webentities.add(infos[2])
            add_row[add_pos["webentities"]] = "|".join(webentities)
            add_row[add_pos["matched_urls"]] = "|".join(matched_urls)
            if matched_urls:
                casa.writerow(row, add_row)
    except Exception:
        print("ERROR while processing", row, file=sys.stderr)
        raise
def url_parse_action(namespace):
    output_file = open_output_file(namespace.output)

    enricher = casanova.enricher(namespace.file, output_file,
                                 add=REPORT_HEADERS,
                                 keep=namespace.select)

    loading_bar = tqdm(desc='Parsing',
                       dynamic_ncols=True,
                       unit=' rows',
                       total=namespace.total)

    for row, url in enricher.cells(namespace.column, with_rows=True):
        url = url.strip()
        loading_bar.update()

        if namespace.separator:
            urls = url.split(namespace.separator)
        else:
            urls = [url]

        for url in urls:
            if not is_url(url, allow_spaces_in_path=True):
                enricher.writerow(row)
                continue

            enricher.writerow(row, [
                normalize_url(url,
                              strip_protocol=namespace.strip_protocol,
                              strip_trailing_slash=True),
                get_domain_name(url),
                get_hostname(url),
                get_normalized_hostname(url)
            ])

    output_file.close()
def filter_and_enrich_tweets_from_mongo(db, cat_urls, of=sys.stdout):
    categories = list(cat_urls.keys())
    fields = "id,time,created_at,from_user_name,text,filter_level,possibly_sensitive,withheld_copyright,withheld_scope,withheld_countries,truncated,retweet_count,favorite_count,reply_count,lang,to_user_name,to_user_id,in_reply_to_status_id,source,source_name,source_url,location,lat,lng,from_user_id,from_user_realname,from_user_verified,from_user_description,from_user_url,from_user_profile_image_url,from_user_utcoffset,from_user_timezone,from_user_lang,from_user_tweetcount,from_user_followercount,from_user_friendcount,from_user_favourites_count,from_user_listed,from_user_withheld_scope,from_user_withheld_countries,from_user_created_at,collected_via_search,collected_via_stream,collected_via_thread_only,collected_at_timestamp,retweeted_id,retweeted_user_name,retweeted_user_id,quoted_id,quoted_user_name,quoted_user_id,links,medias_urls,medias_files,mentioned_user_names,mentioned_user_ids,hashtags".split(",")
    headers = fields + ["matched_urls", "webentities"] + categories
    writer = csv.DictWriter(of, fieldnames=headers, extrasaction="ignore")
    writer.writeheader()
    for t in tqdm(db.find(), total=db.count()):
        if len(t.keys()) < 10:
            continue
        for f in fields:
            t[f] = get_field(f, t)
        try:
            links = [normalize_url(u) for u in t["links"].split('|')]
            if not links:
                continue
            t["matched_urls"] = []
            t["webentities"] = set()
            for cat in categories:
                cat_match = False
                for we, urls in cat_urls[cat].items():
                    # Iterate over a copy since matched links are removed along the way
                    for u in list(links):
                        if u in urls:
                            cat_match = True
                            t["matched_urls"].append(u)
                            t["webentities"].add(we)
                            links.remove(u)
                t[cat] = cat_match
            if t["webentities"]:
                t["matched_urls"] = "|".join(t["matched_urls"])
                t["webentities"] = "|".join(t["webentities"])
                writer.writerow(t)
        except Exception:
            print("ERROR while processing", t, file=sys.stderr)
            raise
with open(CORPUS) as f, open(OUTPUT, 'w') as wf:
    reader = csv.DictReader(f)
    writer = csv.DictWriter(wf, fieldnames=reader.fieldnames + ['twitter_search'])
    writer.writeheader()

    for line in reader:
        batch = line['batch (TAGS)']

        if batch == 'EU':
            continue

        prefixes = LRUTrie(**NORMALIZE_KWARGS)

        for prefix in line['PREFIXES AS URL'].split(' '):
            prefixes.set(prefix, prefix)

        matching_prefix = prefixes.match(line['HOME PAGE'])

        if matching_prefix is None:
            print()
            print('Alaaaarm!', line)
            print()

        search_keyword = normalize_url(matching_prefix, **NORMALIZE_KWARGS)
        print(line['NAME'], '=>', search_keyword)

        line['twitter_search'] = search_keyword
        writer.writerow(line)
with open('config.json') as confile:
    conf = json.loads(confile.read())
db = MongoClient(conf['mongo']['host'], conf['mongo']['port'])[conf['mongo']['db']]['tweets']

urls = defaultdict(int)
query = {}
#query["langs"] = "fr"

print "Counting matching results..."
count = db.count(query)

print "Querying and hashing results..."
bar = progressbar.ProgressBar(max_value=count)
for t in bar(db.find(query, limit=count, projection={"links": 1, "proper_links": 1})):
    for l in t.get("proper_links", t["links"]):
        d = normalize_url(l.split("/")[2])
        urls[d] += 1

print "Sorting and storing csv data..."
with open("shared_domains.csv", "w") as f:
    print >> f, "domain,shares"
    bar = progressbar.ProgressBar(max_value=len(urls))
    for link, shares in bar(sorted(urls.items(), key=lambda x: -x[1])):
        print >> f, '%s,%s' % (format_csv(link), shares)
# TODO: improve heuristics
with open(SHARES, 'r') as f:
    reader = csv.reader(f)
    next(reader)

    bar = ProgressBar()
    for line in bar(reader):
        urls = line[1].split('|')

        for url in urls:
            url = normalize_url(url)

            # Dropping homepages
            if '/' not in url:
                continue

            # Black lists
            if url in URL_BLACKLIST:
                continue

            if any(token in url for token in TOKENS_BLACKLIST):
                continue

            if any(url.endswith(token) for token in FINAL_TOKENS_BLACKLIST):
                continue
with open('config.json') as confile:
    conf = json.loads(confile.read())
db = MongoClient(conf['mongo']['host'], conf['mongo']['port'])[conf['mongo']['db']]['tweets']

urls = defaultdict(int)
query = {}

print "Counting matching results..."
count = db.count(query)

print "Building and storing csv data..."
with open("users_urls_domains.csv", "w") as f:
    print >> f, "user_screenname,link,domain,datetime,is_retweet"
    bar = progressbar.ProgressBar(max_value=count)
    for t in bar(db.find(query, limit=count, projection={"user_screen_name": 1, "links": 1, "proper_links": 1, "retweet_id": 1, "created_at": 1})):
        links = t.get("proper_links", t["links"])
        if not links:
            continue
        name = t.get("user_screen_name")
        isRT = 1 if t["retweet_id"] else 0
        dtime = isodate(t["created_at"])
        for l in links:
            lnk = normalize_url(l)
            try:
                domain = normalize_url(l.split("/")[2])
            except Exception as e:
                print >> sys.stderr, "ERROR normalizing domain for url", l, type(e), e
                domain = ""
            print >> f, ",".join([format_csv(v) for v in [name, lnk, domain, dtime, str(isRT)]])
    if LIMIT is not None and next(count) > LIMIT:
        bar.finish()
        break

    user = line['user_screenname']
    user_id = USER_IDS[user]
    links = line['links'].split('|')

    for link in links:
        media = MEDIAS_TRIE.longest(link)

        if media:
            USER_VECTORS[media[NAME_FIELD]][user_id] += 1

            norm_link = normalize_url(link)
            MEDIAS_URLS[media[NAME_FIELD]].add(norm_link)

            writer.writerow({
                'user': user,
                'media': media[NAME_FIELD],
                'normalized_url': norm_link
            })

MEDIAS = list(set([media[NAME_FIELD] for media in MEDIAS_TRIE.values]))

print('Found %i unique users.' % len(USER_IDS))
print('Found %i unique medias.' % len(MEDIAS))

print('Computing media norms...')
def parse_youtube_url(url):
    url = clean_url(url)
    u = normalize_url(url, strip_lang_subdomains=True, strip_trailing_slash=True)
    parsed = urlparse(url)

    # URL pattern youtu.be/VIDEO_ID
    if parsed.netloc == 'youtu.be':
        if "/" not in u:
            return "home", None
        # Extract the video id, then strip query string and percent-encoded leftovers
        url_id = u.split("/")[1]
        url_id = url_id.split("?")[0]
        url_id = url_id.split("%")[0]
        return "video", url_id

    # URL pattern youtube.googleapis.com/v/VIDEO_ID
    if parsed.netloc == 'youtube.googleapis.com':
        if "/v/" in u:
            url_id = u.split("/")[2]
        else:
            raise Exception("Wrong url format %s" % u)
        return "video", url_id

    if parsed.netloc in [
            'img.youtube.com', 'gaming.youtube.com', 'music.youtube.com',
            'studio.youtube.com'
    ]:
        return "irrelevant", None

    if parsed.netloc.endswith('youtube.com'):
        if u in ["youtube.com"] and not parsed.fragment:
            return "home", None
        stem0 = parsed.path.split("/")[1]
        stem1 = parsed.path.split("/")[2] if "/" in parsed.path.lstrip("/") else None
        queryargs = parsed.query.split("&")

        if stem0 in [
                "t", "yt", "results", "playlist", "artist", "channels",
                "audiolibrary", "feed", "intl", "musicpremium", "premium",
                "show", "watch_videos", "comment", "creators",
                "profile_redirector", "static", "view_play_list", "index"
        ]:
            return "irrelevant", None

        # URL pattern youtube.com/channel/CHANNEL_ID
        if stem0 == "channel":
            return "channel", stem1

        # URL pattern youtube.com/user/USER_ID
        if stem0 in ["user", "c"]:
            return "user", stem1

        # URL pattern youtube.com/profile_videos?user=USER_ID
        if stem0 == "attribution_link":
            uarg = [arg for arg in queryargs if arg.startswith("u=")]
            if len(uarg):
                return parse_youtube_url("http://youtube.com" + unquote(uarg[0].split("=")[1]))

        if stem0 in ["profile_videos", "subscription_center"]:
            uarg = [
                arg for arg in queryargs
                if arg.startswith("user=") or arg.startswith("add_user=")
            ]
            if len(uarg):
                return "user", uarg[0].split("=")[1]

        # URL pattern youtube.com/v/VIDEO_ID
        if stem0 in ["v", "embed", "video"]:
            return "video", stem1

        # URL pattern youtube.com/watch?v=VIDEO_ID
        if stem0 in [
                "watch", "redirect", "comment_servlet", "all_comments",
                "watch_popup"
        ]:
            varg = [arg for arg in queryargs if arg.startswith("v=")]
            if len(varg):
                return "video", varg[0].split("=")[1]
            return "video", None

        if stem0 in ["edit", "swf"]:
            varg = [arg for arg in queryargs if arg.startswith("video_id=")]
            if len(varg):
                return "video", varg[0].split("=")[1]
            return "video", None

        # URL pattern youtube.com/#%2Fwatch%3Fv%3DVIDEO_ID
        if "v%3D" in parsed.query:
            fquery = unquote(parsed.query)
            queryargs = fquery.split("?")[1].split("&")
            varg = [arg for arg in queryargs if arg.startswith("v=")]
            if len(varg):
                return "video", varg[0].split("=")[1]

        if "v%3D" in parsed.fragment:
            fquery = unquote(parsed.fragment)
            queryargs = fquery.split("?")[1].split("&")
            varg = [arg for arg in queryargs if arg.startswith("v=")]
            if len(varg):
                return "video", varg[0].split("=")[1]

        if "continue=" in parsed.query:
            urlarg = [arg for arg in queryargs if arg.startswith("continue=")][0].split("=")[1]
            return parse_youtube_url(unquote(urlarg))

        if not stem1 and (not parsed.query or parsed.query in ["sub_confirmation=1"]) and not parsed.fragment:
            return "user", stem0

    return "error", None
from urllib.parse import urlsplit, parse_qsl
from collections import Counter
from tqdm import tqdm
from ural import normalize_url

TOP = 50

FRAGMENTS = Counter()
QUERIES = Counter()
QUERIES_COMBO = Counter()

with open('./scripts/data/urls.csv') as f:
    for line in tqdm(f, desc='Reading urls'):
        url = line.strip()[1:-1]
        url = normalize_url(url, strip_protocol=False)
        parsed = urlsplit(url)

        FRAGMENTS[parsed.fragment] += 1

        if parsed.query:
            for name, value in parse_qsl(parsed.query):
                QUERIES[name] += 1
                QUERIES_COMBO['%s=%s' % (name, value)] += 1


def report(name, counter):
    print()
    title = 'Top %i %s:' % (TOP, name)
    print(title)
parser.add_argument('sources', help='CSV files of sources', type=FileType('r'))
parser.add_argument('target', help='Target CSV file', type=FileType('r'))
parser.add_argument('-o', '--output', help='output file', type=FileType('w'), default=sys.stdout)

args = parser.parse_args()

# Indexing
trie = LRUTrie(strip_trailing_slash=True)
index = {}

for line in csv.DictReader(args.sources):
    trie.set(normalize_url(line['url'], strip_trailing_slash=True), line)
    index[line['mediacloud_id']] = line

reader = csv.DictReader(args.target)
writer = csv.DictWriter(args.output, fieldnames=reader.fieldnames + ['polarisation_id', 'polarisation_name', 'webentity'])
writer.writeheader()

for line in reader:
    url = line['url']

    addendum = {
        'polarisation_id': '',
        'polarisation_name': '',
        'webentity': ''
"proper_links": 1, "retweet_id": 1, "created_at": 1, "user_followers": 1 })): links = t.get("proper_links", t["links"]) if not links: continue name = t.get("user_screen_name") uid = t.get("user_id_str") dtime = isodate(t["created_at"]) isRT = str(1 if t["retweet_id"] else 0) fols = str(t["user_followers"]) for l in links: try: lnk = normalize_url(l.decode("utf-8")) except Exception as e: print >> sys.stderr, "ERROR: url misformatted", l, type(e), e lnk = l try: domain = normalize_url(l.split("/")[2]) except Exception as e: print >> sys.stderr, "ERROR: normalizing domain for url", l, type( e), e domain = "" try: media = trie.match(l) or "" except Exception as e: print >> sys.stderr, "ERROR: LRUtrie matching crashes for url", l, type( e), e media = ""
import csv
from ural import normalize_url

with open('./scripts/data/tricky.csv') as f:
    reader = csv.DictReader(f)

    for line in reader:
        if not line['expanded_links']:
            continue

        for url in line['expanded_links'].split('|'):
            try:
                normalize_url(url)
            except Exception as e:
                print(e, url)
from collections import defaultdict
from pymongo import MongoClient
from ural import normalize_url
from gazouilloire.web.export import format_csv

with open('config.json') as confile:
    conf = json.loads(confile.read())
db = MongoClient(conf['mongo']['host'], conf['mongo']['port'])[conf['mongo']['db']]['tweets']

urls = defaultdict(int)
query = {}
#query["langs"] = "fr"

print "Counting matching results..."
count = db.count(query)

print "Querying and hashing results..."
bar = progressbar.ProgressBar(max_value=count)
for t in bar(db.find(query, limit=count, projection={"links": 1, "proper_links": 1})):
    for l in t.get("proper_links", t["links"]):
        d = normalize_url(l.split("/")[2])
        urls[d] += 1

print "Sorting and storing csv data..."
with open("shared_domains.csv", "w") as f:
    print >> f, "domain,shares"
    bar = progressbar.ProgressBar(max_value=len(urls))
    for link, shares in bar(sorted(urls.items(), key=lambda x: -x[1])):
        print >> f, '%s,%s' % (format_csv(link), shares)
MEDIAS = {}
for media in trie.values:
    if not media['mediacloud_id']:
        continue
    MEDIAS[int(media['mediacloud_id'])] = media

# Indexing urls
with open(SHARED_URLS_FILE, 'r') as f:
    reader = csv.reader(f)
    next(reader)

    bar = ProgressBar()
    for line in bar(reader):
        DEDUPED_URLS[normalize_url(line[0])] = int(line[1])
    bar.finish()

# Retrieving mediacloud urls
with open(OUTPUT, 'w') as f:
    writer = csv.DictWriter(f, fieldnames=['query', 'id', 'date', 'url', 'normalized', 'title', 'media', 'mediacloud_id', 'shares'])
    writer.writeheader()

    for query in QUERIES:
        print('Query "%s"' % query)

        nb_batches = 0
        last = 0

        # TODO: add mediacloud_id + match by id rather
def test_normalize_url(self):
    for url, normalized in TESTS:
        assert normalize_url(url) == normalized, url

    for url, normalized, kwargs in TESTS_ADVANCED:
        assert normalize_url(url, **kwargs) == normalized, url
        # TODO: output them
        print('Could not match', line['NAME'], line['STATUS'])
    else:
        NAME_INDEX[custom_fingerprint(line['NAME'])] = match
        NGRAMS_NAME_INDEX[custom_ngrams_fingerprint(line['NAME'])] = record

    # Warning for entities having dubious home pages
    home_page = line['HOME PAGE']

    if 'twitter' in home_page or 'facebook' in home_page:
        print('WARNING: %s has a dubious home page %s' % (line['NAME'], home_page))

    # Printing report
    unique_prefixes = set(
        normalize_url(prefix, strip_trailing_slash=True)
        for prefix in prefixes
        if not ('twitter' in prefix or 'facebook' in prefix or
                'google' in prefix or 'pinterest' in prefix))

    if len(unique_prefixes) < 2:
        continue

    p('')
    p('## %s' % line['NAME'])
    p('')

    for prefix in sorted(unique_prefixes, key=lambda p: len(p.split('/'))):
        p('* %s' % prefix)

# Sanity check
with open("users_urls_domains.csv", "w") as f: print >> f, "user_screenname,link,domain,datetime,is_retweet" bar = progressbar.ProgressBar(max_value=count) for t in bar( db.find(query, limit=count, projection={ "user_screen_name": 1, "links": 1, "proper_links": 1, "retweet_id": 1, "created_at": 1 })): links = t.get("proper_links", t["links"]) if not links: continue name = t.get("user_screen_name") isRT = 1 if t["retweet_id"] else 0 dtime = isodate(t["created_at"]) for l in links: lnk = normalize_url(l) try: domain = normalize_url(l.split("/")[2]) except Exception as e: print >> sys.stderr, "ERROR normalizing domain for url", l, type( e), e domain = "" print >> f, ",".join( [format_csv(v) for v in [name, lnk, domain, dtime, str(isRT)]])