def read_list_urls(list_urls_file):
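    """Read a CSV of labelled URLs and build a category ("niv0") -> webentity ->
    set of normalized URLs mapping."""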
    categories_medias_urls = defaultdict(lambda: defaultdict(set))
    with open(list_urls_file, "r") as f:
        for row in csv.DictReader(f):
            url = normalize_url(row["clean_url"].strip())
            categories_medias_urls[row["niv0"]][row["webentity"]].add(url)
    return categories_medias_urls
Code example #2
File: normalize_url_test.py Project: oubine/ural
    def test_basics(self):
        for url, normalized in TESTS:
            assert normalize_url(url) == normalized, url

        assert normalize_url('lemonde.fr/index/',
                             strip_trailing_slash=True) == 'lemonde.fr'
        assert normalize_url('https://[email protected]',
                             strip_authentication=False) == '*****@*****.**'
        assert normalize_url(
            'https://www.lemonde.fr',
            strip_protocol=False,
            strip_irrelevant_subdomain=False) == 'https://www.lemonde.fr'
        assert normalize_url(
            'www.lemonde.fr',
            strip_protocol=False,
            strip_irrelevant_subdomain=False) == 'www.lemonde.fr'
Code example #3
def filter_and_enrich_tweets_from_csv(f, cat_urls, of=sys.stdout, total=None):
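    """Keep only the tweets whose links match a known URL and enrich each kept
    row with the matched URLs, their webentities and one boolean per category."""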
    categories = list(cat_urls.keys())
    casa = casanova.enricher(f,
                             of,
                             add=["matched_urls", "webentities"] + categories)
    links_pos = casa.pos.links

    try:
        for row in tqdm(casa, total=total):
            links = [normalize_url(u) for u in row[links_pos].split('|')]
            if not links:
                continue

            matched_urls = []
            webentities = set()
            cat_belongings = []
            for cat in categories:
                cat_match = False
                for we, urls in cat_urls[cat].items():
                    for u in list(links):  # iterate over a copy so matched links can be removed
                        if u in urls:
                            cat_match = True
                            matched_urls.append(u)
                            webentities.add(we)
                            links.remove(u)
                cat_belongings.append(cat_match)

            if webentities:
                casa.writerow(row,
                              ["|".join(matched_urls), "|".join(webentities)] +
                              cat_belongings)

    except Exception as e:
        print("ERROR while processing", row, file=sys.stderr)
        raise
Code example #4
def associate_urls_medias(csviterator, csv_field, trie):
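    """Print a url,media CSV line for each row of the iterator, resolving the
    URL to a media with the trie (empty media when there is no match)."""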
    print("url,media")
    for row in csviterator:
        link = row.get(csv_field, None)
        if not link: continue
        url = normalize_url(link,
                            strip_irrelevant_subdomain=False,
                            strip_protocol=False)
        media = trie.longest(link) or ""
        print('"%s","%s"' %
              (link.replace('"', '""'), media.replace('"', '""')))
def extract_media_urls(db, trie):
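    """Print a url,media CSV of the distinct normalized links found in French
    tweets that match a media in the trie."""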
    print("url,media")
    done = set()
    for tweet in db.find({"langs": "fr"}, projection=["links", "proper_links"]):
        for link in tweet.get("proper_links", tweet["links"]):
            link = normalize_url(link, strip_irrelevant_subdomains=False, strip_protocol=False)
            if link in done:
                continue
            done.add(link)
            media = trie.longest(link)
            if media:
                print('"%s","%s"' % (link.replace('"', '""'), media.replace('"', '""')))
Code example #6
File: url_parse.py Project: zanachka/minet
def extract_standard_addendum(cli_args, url):
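    """Return the enrichment values computed for a single URL: normalized URL,
    inferred redirection (if any), domain name, hostname, normalized hostname
    and a shortened-URL flag."""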
    inferred_redirection = infer_redirection(url)

    return [
        normalize_url(url,
                      strip_protocol=cli_args.strip_protocol,
                      strip_trailing_slash=True),
        inferred_redirection if inferred_redirection != url else '',
        get_domain_name(url),
        get_hostname(url),
        get_normalized_hostname(url), 'yes' if is_shortened_url(url) else ''
    ]
Code example #7
def read_urls_types(list_urls_file):
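    """Index each normalized URL by its code, block and webentity, and collect
    the distinct codes and blocks found in the CSV file."""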
    urls_types = {}
    codes = set()
    categories = set()
    with open(list_urls_file, "r") as f:
        reader = csv.DictReader(f)
        for row in reader:
            codes.add(row["code"])
            categories.add(row["block"])
            url = normalize_url(row["clean_url"].strip(),
                                strip_trailing_slash=True)
            urls_types[url] = [row["code"], row["block"], row["webentity"]]
    return urls_types, list(codes), list(categories)
Code example #8
def clean_data(url_df, fact_check_df, SCIENTIFIC_TOPIC):
    """Clean and merge the appearance data"""

    # Remove the spaces accidentally added around the URLs
    url_df['url'] = url_df['url'].transform(lambda x: x.strip())

    # Filter the URLs to keep only the ones flagged as False or equivalent:
    url_df = url_df[(url_df['Flag as'].isin(
        ['False', 'Partly false', 'Misleading', 'False headline']))]

    # Use a regex to extract the 'field' from the fact-check website URL:
    # if the fact-check URL starts with 'https://climatefeedback.org' -> 'climate' field
    # if the fact-check URL starts with 'https://healthfeedback.org'  -> 'health' field
    fact_check_df['field'] = fact_check_df['Review url'].str.extract(
        'https://([^/]+)feedback.org')

    # Merge the two dataframes to get the 'field' for each url:
    url_df = url_df.dropna(subset=['Item reviewed'])
    fact_check_df = fact_check_df.dropna(subset=['Items reviewed'])
    url_df = url_df.merge(fact_check_df[['Items reviewed', 'field', 'topic']],
                          left_on='Item reviewed',
                          right_on='Items reviewed',
                          how='left')

    # Keep only the URLs about the scientific topic of interest:
    url_df.loc[url_df['topic'] == 'COVID-19', 'field'] = 'COVID-19'
    url_df = url_df.dropna(subset=['field'])
    url_df = url_df[url_df['field'] == SCIENTIFIC_TOPIC]

    # Clean the URLs and extract their domain names:
    url_df['url'] = url_df['url'].apply(lambda x: ural.normalize_url(
        x, strip_protocol=False, strip_trailing_slash=True))
    url_df['domain_name'] = url_df['url'].apply(
        lambda x: ural.get_domain_name(x))

    # Remove duplicate URLs from the dataframe,
    # keeping only the first, i.e. the most recent, occurrence.
    url_df = url_df.drop_duplicates(subset="url", keep="first")

    # # Remove the plateforms from the analysis:
    # plateforms = ["facebook.com", "youtube.com", "twitter.com", "wordpress.com", "instagram.com"]
    # url_df = url_df[~url_df['domain_name'].isin(plateforms)]

    # # Remove the url with parameters from the analysis because CT return wrong results for them:
    # url_df['parameter_in_url'] = url_df['url'].apply(lambda x: '?' in x)
    # url_df = url_df[url_df['parameter_in_url']==False]

    url_df = url_df[['url', 'Item reviewed', 'field', 'domain_name']]

    return url_df
Code example #9
def extract_users_urls_medias_from_csv(f,
                                       trie,
                                       of=sys.stdout,
                                       total=None,
                                       filter_fr=False,
                                       min_date=None):
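    """Write one CSV row per (tweet, matched URL) pair, resolving each link to
    its webentity with the trie; tweets can optionally be filtered by language
    ('fr') and by a minimum date."""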
    headers = [
        'tweet_id', 'user_screen_name', 'user_id', 'normalized_url',
        'domain_name', 'webentity', 'datetime', 'is_retweet', 'nb_followers'
    ]
    writer = csv.writer(of)
    writer.writerow(headers)
    casa = casanova.reader(f)
    try:
        for row, (tid, uname, uid, dtime, rtid, nbfols, links,
                  lang) in tqdm(enumerate(
                      casa.cells([
                          'id', 'from_user_name', 'from_user_id', 'created_at',
                          'retweeted_id', 'from_user_followercount', 'links',
                          'lang'
                      ])),
                                total=total):
            if filter_fr and lang != 'fr':
                continue
            if min_date and dtime < min_date:
                continue
            is_rt = (rtid != '')
            for url in links.split('|'):
                url = url.strip()
                if not url:
                    continue
                webentity = trie.match(url)
                normalized = normalize_url(url)
                domain = normalized.split("/")[0]
                if not webentity:
                    #if "twitter.com/" not in url and "youtube.com" not in url:
                    #    print('WARNING: url unmatched on row #%s: %s' % (row, domain), file=sys.stderr)
                    continue
                writer.writerow([
                    tid, uname, uid, normalized, domain, webentity, dtime,
                    is_rt, nbfols
                ])

    except Exception as e:
        print(
            'ERROR while processing row #%s (https://twitter.com/%s/statuses/%s)'
            % (row, uname, tid),
            file=sys.stderr)
        raise
Code example #10
def clean_url_format(url_df):
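    """Strip, normalize and deduplicate the URLs of the dataframe, adding the
    'url_cleaned' and 'domain_name' columns."""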

    url_df['url'] = url_df['url'].transform(lambda x: x.strip())

    url_df['url_cleaned'] = url_df['url']\
        .apply(lambda x: ural.normalize_url(x,
                                            strip_protocol=False,
                                            strip_trailing_slash=True))
    url_df['domain_name'] = url_df['url_cleaned'].apply(
        lambda x: ural.get_domain_name(x))

    # Remove duplicate URLs from the dataframe,
    # keeping only the first, i.e. the most recent, occurrence.
    url_df = url_df.drop_duplicates(subset="url", keep="first")

    return url_df
Code example #11
File: normalize.py Project: oubine/ural
def normalize_action(namespace):
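    """CLI action: append a '<column>_normalized' column to the input CSV,
    normalizing each URL with the requested options."""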
    sort_query = not namespace.no_query_sort
    strip_authentication = not namespace.keep_authentication
    strip_trailing_slash = namespace.strip_trailing_slash
    strip_index = not namespace.keep_index

    headers, position, reader = custom_reader(namespace.file, namespace.column)

    headers.append(namespace.column + "_normalized")
    writer = csv.writer(namespace.output)
    writer.writerow(headers)

    for line in reader:
        url = line[position]
        line.append(normalize_url(url, sort_query=sort_query, strip_authentication=strip_authentication,
                                  strip_trailing_slash=strip_trailing_slash, strip_index=strip_index))
        writer.writerow(line)
Code example #12
def filter_and_enrich_tweets_from_csv(f,
                                      cat_urls,
                                      codes,
                                      categories,
                                      of=sys.stdout,
                                      total=None):
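    """Enrich each tweet row having at least one known link with its matched
    URLs, webentities, and one boolean column per category and per code."""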
    add_fields = ["matched_urls", "webentities"] + categories + codes
    casa = casanova.enricher(f, of, add=add_fields)
    links_pos = casa.pos.links
    len_row = len(casa.fieldnames) - casa.added_count
    add_pos = {field: i for i, field in enumerate(add_fields)}
    try:
        for row in tqdm(casa, total=total):
            links = [
                normalize_url(u.strip(), strip_trailing_slash=True)
                for u in row[links_pos].split('|')
            ]
            if not links:
                continue
            webentities = set()
            matched_urls = set()
            add_row = ['', ''] + [False for i in categories
                                  ] + [False for j in codes]
            for u in links:
                infos = cat_urls.get(u, None)
                if infos:
                    matched_urls.add(u)
                    add_row[add_pos[infos[0]]] = True
                    add_row[add_pos[infos[1]]] = True
                    webentities.add(infos[2])
            add_row[add_pos["webentities"]] = "|".join(webentities)
            add_row[add_pos["matched_urls"]] = "|".join(matched_urls)

            if matched_urls:
                casa.writerow(row, add_row)

    except Exception as e:
        print("ERROR while processing", row, file=sys.stderr)
        raise
Code example #13
def url_parse_action(namespace):
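    """CLI action: enrich each row of the input CSV with the normalized URL,
    domain name, hostname and normalized hostname of the URLs found in the
    target column."""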

    output_file = open_output_file(namespace.output)

    enricher = casanova.enricher(namespace.file,
                                 output_file,
                                 add=REPORT_HEADERS,
                                 keep=namespace.select)

    loading_bar = tqdm(desc='Parsing',
                       dynamic_ncols=True,
                       unit=' rows',
                       total=namespace.total)

    for row, url in enricher.cells(namespace.column, with_rows=True):
        url = url.strip()

        loading_bar.update()

        if namespace.separator:
            urls = url.split(namespace.separator)
        else:
            urls = [url]

        for url in urls:
            if not is_url(url, allow_spaces_in_path=True):
                enricher.writerow(row)
                continue

            enricher.writerow(row, [
                normalize_url(url,
                              strip_protocol=namespace.strip_protocol,
                              strip_trailing_slash=True),
                get_domain_name(url),
                get_hostname(url),
                get_normalized_hostname(url)
            ])

    output_file.close()
Code example #14
def filter_and_enrich_tweets_from_mongo(db, cat_urls, of=sys.stdout):
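    """Same filtering and enrichment as the CSV version above, but reading the
    tweets directly from a MongoDB collection."""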
    categories = list(cat_urls.keys())
    fields = "id,time,created_at,from_user_name,text,filter_level,possibly_sensitive,withheld_copyright,withheld_scope,withheld_countries,truncated,retweet_count,favorite_count,reply_count,lang,to_user_name,to_user_id,in_reply_to_status_id,source,source_name,source_url,location,lat,lng,from_user_id,from_user_realname,from_user_verified,from_user_description,from_user_url,from_user_profile_image_url,from_user_utcoffset,from_user_timezone,from_user_lang,from_user_tweetcount,from_user_followercount,from_user_friendcount,from_user_favourites_count,from_user_listed,from_user_withheld_scope,from_user_withheld_countries,from_user_created_at,collected_via_search,collected_via_stream,collected_via_thread_only,collected_at_timestamp,retweeted_id,retweeted_user_name,retweeted_user_id,quoted_id,quoted_user_name,quoted_user_id,links,medias_urls,medias_files,mentioned_user_names,mentioned_user_ids,hashtags".split(
        ",")
    headers = fields + ["matched_urls", "webentities"] + categories
    writer = csv.DictWriter(of, fieldnames=headers, extrasaction="ignore")
    writer.writeheader()

    for t in tqdm(db.find(), total=db.count()):
        if len(t.keys()) < 10:
            continue
        for f in fields:
            t[f] = get_field(f, t)
        try:
            links = [normalize_url(u) for u in t["links"].split('|')]
            if not links:
                continue

            t["matched_urls"] = []
            t["webentities"] = set()
            for cat in categories:
                cat_match = False
                for we, urls in cat_urls[cat].items():
                    for u in list(links):  # iterate over a copy so matched links can be removed
                        if u in urls:
                            cat_match = True
                            t["matched_urls"].append(u)
                            t["webentities"].add(we)
                            links.remove(u)
                t[cat] = cat_match

            if t["webentities"]:
                t["matched_urls"] = "|".join(t["matched_urls"])
                t["webentities"] = "|".join(t["webentities"])
                writer.writerow(t)

        except Exception as e:
            print("ERROR while processing", t, file=sys.stderr)
            raise
Code example #15
with open(CORPUS) as f, open(OUTPUT, 'w') as wf:
    reader = csv.DictReader(f)
    writer = csv.DictWriter(wf,
                            fieldnames=reader.fieldnames + ['twitter_search'])
    writer.writeheader()

    for line in reader:
        batch = line['batch (TAGS)']

        if batch == 'EU':
            continue

        prefixes = LRUTrie(**NORMALIZE_KWARGS)

        for prefix in line['PREFIXES AS URL'].split(' '):
            prefixes.set(prefix, prefix)

        matching_prefix = prefixes.match(line['HOME PAGE'])

        if matching_prefix is None:
            print()
            print('Alaaaarm!', line)
            print()

        search_keyword = normalize_url(matching_prefix, **NORMALIZE_KWARGS)

        print(line['NAME'], '=>', search_keyword)

        line['twitter_search'] = search_keyword
        writer.writerow(line)
Code example #16
with open('config.json') as confile:
    conf = json.loads(confile.read())

db = MongoClient(conf['mongo']['host'],
                 conf['mongo']['port'])[conf['mongo']['db']]['tweets']

urls = defaultdict(int)
query = {}
#query["langs"] = "fr"

print "Counting matching results..."
count = db.count(query)
print "Querying and hashing results..."
bar = progressbar.ProgressBar(max_value=count)
for t in bar(
        db.find(query, limit=count, projection={
            "links": 1,
            "proper_links": 1
        })):
    for l in t.get("proper_links", t["links"]):
        d = normalize_url(l.split("/")[2])
        urls[d] += 1

print "Sorting and storing csv data..."
with open("shared_domains.csv", "w") as f:
    print >> f, "domain,shares"
    bar = progressbar.ProgressBar(max_value=len(urls))
    for link, shares in bar(sorted(urls.items(), key=lambda x: -x[1])):
        print >> f, '%s,%s' % (format_csv(link), shares)
Code example #17
# TODO: improve heuristics

with open(SHARES, 'r') as f:
    reader = csv.reader(f)
    next(reader)

    bar = ProgressBar()

    for line in bar(reader):

        urls = line[1].split('|')

        for url in urls:

            url = normalize_url(url)

            # Dropping homepages
            if '/' not in url:
                continue

            # Black lists
            if url in URL_BLACKLIST:
                continue

            if any(token in url for token in TOKENS_BLACKLIST):
                continue

            if any(url.endswith(token) for token in FINAL_TOKENS_BLACKLIST):
                continue
Code example #18
with open('config.json') as confile:
    conf = json.loads(confile.read())

db = MongoClient(conf['mongo']['host'], conf['mongo']['port'])[conf['mongo']['db']]['tweets']

urls = defaultdict(int)
query = {}
print "Counting matching results..."
count = db.count(query)

print "Building and storing csv data..."
with open("users_urls_domains.csv", "w") as f:
    print >> f, "user_screenname,link,domain,datetime,is_retweet"
    bar = progressbar.ProgressBar(max_value=count)
    for t in bar(db.find(query, limit=count, projection={"user_screen_name": 1, "links": 1, "proper_links": 1, "retweet_id": 1, "created_at": 1})):
        links = t.get("proper_links", t["links"])
        if not links:
            continue
        name = t.get("user_screen_name")
        isRT = 1 if t["retweet_id"] else 0
        dtime = isodate(t["created_at"])
        for l in links:
            lnk = normalize_url(l)
            try:
                domain = normalize_url(l.split("/")[2])
            except Exception as e:
                print >> sys.stderr, "ERROR normalizing domain for url", l, type(e), e
                domain = ""
            print >> f, ",".join([format_csv(v) for v in [name, lnk, domain, dtime, str(isRT)]])
        if LIMIT is not None and next(count) > LIMIT:
            bar.finish()
            break

        user = line['user_screenname']
        user_id = USER_IDS[user]
        links = line['links'].split('|')

        for link in links:
            media = MEDIAS_TRIE.longest(link)

            if media:

                USER_VECTORS[media[NAME_FIELD]][user_id] += 1

                norm_link = normalize_url(link)
                MEDIAS_URLS[media[NAME_FIELD]].add(norm_link)

                writer.writerow({
                    'user': user,
                    'media': media[NAME_FIELD],
                    'normalized_url': norm_link
                })

MEDIAS = list(set([media[NAME_FIELD] for media in MEDIAS_TRIE.values]))

print('Found %i unique users.' % len(USER_IDS))
print('Found %i unique medias.' % len(MEDIAS))

print('Computing media norms...')
Code example #20
def parse_youtube_url(url):
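    """Classify a YouTube URL and return a (type, id) tuple, where type is one
    of 'video', 'channel', 'user', 'home', 'irrelevant' or 'error'."""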
    url = clean_url(url)
    u = normalize_url(url,
                      strip_lang_subdomains=True,
                      strip_trailing_slash=True)
    parsed = urlparse(url)
    # URL pattern youtu.be/VIDEO_ID
    if parsed.netloc == 'youtu.be':
        if "/" not in u:
            return "home", None
        url_id = u.split("/")[1]
        url_id = u.split("?")[0]
        url_id = u.split("%")[0]
        return "video", url_id
    # URL pattern youtube.googleapis.com/v/VIDEO_ID
    if parsed.netloc == 'youtube.googleapis.com':
        if "/v/" in u:
            url_id = u.split("/")[2]
        else:
            raise Exception("Wrong url format %s" % u)
        return "video", url_id
    if parsed.netloc in [
            'img.youtube.com', 'gaming.youtube.com', 'music.youtube.com',
            'studio.youtube.com'
    ]:
        return "irrelevant", None
    if parsed.netloc.endswith('youtube.com'):
        if u in ["youtube.com"] and not parsed.fragment:
            return "home", None
        stem0 = parsed.path.split("/")[1]
        stem1 = parsed.path.split("/")[2] if "/" in parsed.path.lstrip(
            "/") else None
        queryargs = parsed.query.split("&")
        if stem0 in [
                "t", "yt", "results", "playlist", "artist", "channels",
                "audiolibrary", "feed", "intl", "musicpremium", "premium",
                "show", "watch_videos", "comment", "creators",
                "profile_redirector", "static", "view_play_list", "index"
        ]:
            return "irrelevant", None
        # URL pattern youtube.com/channel/CHANNEL_ID
        if stem0 == "channel":
            return "channel", stem1
        # URL pattern youtube.com/user/USER_ID
        if stem0 in ["user", "c"]:
            return "user", stem1
        # URL pattern youtube.com/profile_videos?user=USER_ID
        if stem0 == "attribution_link":
            uarg = [arg for arg in queryargs if arg.startswith("u=")]
            if len(uarg):
                return parse_youtube_url("http://youtube.com" +
                                         unquote(uarg[0].split("=")[1]))
        if stem0 in ["profile_videos", "subscription_center"]:
            uarg = [
                arg for arg in queryargs
                if arg.startswith("user="******"add_user="******"user", uarg[0].split("=")[1]
        # URL pattern youtube.com/v/VIDEO_ID
        if stem0 in ["v", "embed", "video"]:
            return "video", stem1
        # URL pattern youtube.com/watch?v=VIDEO_ID
        if stem0 in [
                "watch", "redirect", "comment_servlet", "all_comments",
                "watch_popup"
        ]:
            varg = [arg for arg in queryargs if arg.startswith("v=")]
            if len(varg):
                return "video", varg[0].split("=")[1]
            return "video", None
        if stem0 in ["edit", "swf"]:
            varg = [arg for arg in queryargs if arg.startswith("video_id=")]
            if len(varg):
                return "video", varg[0].split("=")[1]
            return "video", None
        # URL pattern youtube.com/#%2Fwatch%3Fv%3DVIDEO_ID
        if "v%3D" in parsed.query:
            fquery = unquote(parsed.query)
            queryargs = fquery.split("?")[1].split("&")
            varg = [arg for arg in queryargs if arg.startswith("v=")]
            if len(varg):
                return "video", varg[0].split("=")[1]
        if "v%3D" in parsed.fragment:
            fquery = unquote(parsed.fragment)
            queryargs = fquery.split("?")[1].split("&")
            varg = [arg for arg in queryargs if arg.startswith("v=")]
            if len(varg):
                return "video", varg[0].split("=")[1]
        if "continue=" in parsed.query:
            urlarg = [arg for arg in queryargs
                      if arg.startswith("continue=")][0].split("=")[1]
            return parse_youtube_url(unquote(urlarg))
        if not stem1 and (not parsed.query or parsed.query
                          in ["sub_confirmation=1"]) and not parsed.fragment:
            return "user", stem0
    return "error", None
Code example #21
File: analysis.py Project: zanachka/ural
from urllib.parse import urlsplit, parse_qsl
from collections import Counter
from tqdm import tqdm

from ural import normalize_url

TOP = 50

FRAGMENTS = Counter()
QUERIES = Counter()
QUERIES_COMBO = Counter()

with open('./scripts/data/urls.csv') as f:
    for line in tqdm(f, desc='Reading urls'):
        url = line.strip()[1:-1]
        url = normalize_url(url, strip_protocol=False)
        parsed = urlsplit(url)

        FRAGMENTS[parsed.fragment] += 1

        if parsed.query:
            for name, value in parse_qsl(parsed.query):
                QUERIES[name] += 1
                QUERIES_COMBO['%s=%s' % (name, value)] += 1


def report(name, counter):
    print()

    title = 'Top %i %s:' % (TOP, name)
    print(title)
Code example #22
parser.add_argument('sources', help='CSV files of sources', type=FileType('r'))
parser.add_argument('target', help='Target CSV file', type=FileType('r'))
parser.add_argument('-o',
                    '--output',
                    help='output file',
                    type=FileType('w'),
                    default=sys.stdout)

args = parser.parse_args()

# Indexing
trie = LRUTrie(strip_trailing_slash=True)
index = {}

for line in csv.DictReader(args.sources):
    trie.set(normalize_url(line['url'], strip_trailing_slash=True), line)
    index[line['mediacloud_id']] = line

reader = csv.DictReader(args.target)
writer = csv.DictWriter(args.output,
                        fieldnames=reader.fieldnames +
                        ['polarisation_id', 'polarisation_name', 'webentity'])
writer.writeheader()

for line in reader:
    url = line['url']

    addendum = {
        'polarisation_id': '',
        'polarisation_name': '',
        'webentity': ''
    }
Code example #23
                 "proper_links": 1,
                 "retweet_id": 1,
                 "created_at": 1,
                 "user_followers": 1
             })):
 links = t.get("proper_links", t["links"])
 if not links:
     continue
 name = t.get("user_screen_name")
 uid = t.get("user_id_str")
 dtime = isodate(t["created_at"])
 isRT = str(1 if t["retweet_id"] else 0)
 fols = str(t["user_followers"])
 for l in links:
     try:
         lnk = normalize_url(l.decode("utf-8"))
     except Exception as e:
         print >> sys.stderr, "ERROR: url misformatted", l, type(e), e
         lnk = l
     try:
         domain = normalize_url(l.split("/")[2])
     except Exception as e:
         print >> sys.stderr, "ERROR: normalizing domain for url", l, type(
             e), e
         domain = ""
     try:
         media = trie.match(l) or ""
     except Exception as e:
         print >> sys.stderr, "ERROR: LRUtrie matching crashes for url", l, type(
             e), e
         media = ""
Code example #24
import csv
from ural import normalize_url

with open('./scripts/data/tricky.csv') as f:
    reader = csv.DictReader(f)

    for line in reader:
        if not line['expanded_links']:
            continue

        for url in line['expanded_links'].split('|'):
            try:
                normalize_url(url)
            except Exception as e:
                print(e, url)
Code example #25
from collections import defaultdict
from pymongo import MongoClient
from ural import normalize_url
from gazouilloire.web.export import format_csv

with open('config.json') as confile:
    conf = json.loads(confile.read())

db = MongoClient(conf['mongo']['host'], conf['mongo']['port'])[conf['mongo']['db']]['tweets']

urls = defaultdict(int)
query = {}
#query["langs"] = "fr"

print "Counting matching results..."
count = db.count(query)
print "Querying and hashing results..."
bar = progressbar.ProgressBar(max_value=count)
for t in bar(db.find(query, limit=count, projection={"links": 1, "proper_links": 1})):
    for l in t.get("proper_links", t["links"]):
        d = normalize_url(l.split("/")[2])
        urls[d] += 1

print "Sorting and storing csv data..."
with open("shared_domains.csv", "w") as f:
    print >> f, "domain,shares"
    bar = progressbar.ProgressBar(max_value=len(urls))
    for link, shares in bar(sorted(urls.items(), key = lambda x: -x[1])):
        print >> f, '%s,%s' % (format_csv(link), shares)

Code example #26
MEDIAS = {}

for media in trie.values:
    if not media['mediacloud_id']:
        continue
    MEDIAS[int(media['mediacloud_id'])] = media

# Indexing urls
with open(SHARED_URLS_FILE, 'r') as f:
    reader = csv.reader(f)
    next(reader)

    bar = ProgressBar()

    for line in bar(reader):
        DEDUPED_URLS[normalize_url(line[0])] = int(line[1])

bar.finish()

# Retrieving mediacloud urls
with open(OUTPUT, 'w') as f:
    writer = csv.DictWriter(f, fieldnames=['query', 'id', 'date', 'url', 'normalized', 'title', 'media', 'mediacloud_id', 'shares'])
    writer.writeheader()

    for query in QUERIES:
        print('Query "%s"' % query)

        nb_batches = 0
        last = 0

        # TODO: add mediacloud_id + match by id rather
Code example #27
    def test_normalize_url(self):
        for url, normalized in TESTS:
            assert normalize_url(url) == normalized, url

        for url, normalized, kwargs in TESTS_ADVANCED:
            assert normalize_url(url, **kwargs) == normalized, url
Code example #28
            # TODO: output them
            print('Could not match', line['NAME'], line['STATUS'])
        else:
            NAME_INDEX[custom_fingerprint(line['NAME'])] = match
            NGRAMS_NAME_INDEX[custom_ngrams_fingerprint(line['NAME'])] = record

        # Warning for entities having dubious home pages
        home_page = line['HOME PAGE']

        if 'twitter' in home_page or 'facebook' in home_page:
            print('WARNING: %s has a dubious home page %s' %
                  (line['NAME'], home_page))

        # Printing report
        unique_prefixes = set(
            normalize_url(prefix, strip_trailing_slash=True)
            for prefix in prefixes
            if not ('twitter' in prefix or 'facebook' in prefix
                    or 'google' in prefix or 'pinterest' in prefix))

        if len(unique_prefixes) < 2:
            continue

        p('')
        p('## %s' % line['NAME'])
        p('')

        for prefix in sorted(unique_prefixes, key=lambda p: len(p.split('/'))):
            p('* %s' % prefix)

# Sanity check
Code example #29
with open("users_urls_domains.csv", "w") as f:
    print >> f, "user_screenname,link,domain,datetime,is_retweet"
    bar = progressbar.ProgressBar(max_value=count)
    for t in bar(
            db.find(query,
                    limit=count,
                    projection={
                        "user_screen_name": 1,
                        "links": 1,
                        "proper_links": 1,
                        "retweet_id": 1,
                        "created_at": 1
                    })):
        links = t.get("proper_links", t["links"])
        if not links:
            continue
        name = t.get("user_screen_name")
        isRT = 1 if t["retweet_id"] else 0
        dtime = isodate(t["created_at"])
        for l in links:
            lnk = normalize_url(l)
            try:
                domain = normalize_url(l.split("/")[2])
            except Exception as e:
                print >> sys.stderr, "ERROR normalizing domain for url", l, type(
                    e), e
                domain = ""
            print >> f, ",".join(
                [format_csv(v) for v in [name, lnk, domain, dtime,
                                         str(isRT)]])