Example #1
def main():
    initialize_logger('./log')
    
    generated = datetime.now()
    kind, source, outdir = parse_options(sys.argv[1:])
    logging.info("Parsing catalyst - Started")
    logging.info("Parsing catalyst - Source file: %(s)s" % {'s':source})
    logging.info("Parsing catalyst - Output directory: %(s)s" % {'s':outdir})
    logging.info("Parsing catalyst - Extraction Kind: %(s)s" % {'s':kind})
    
    # 1. load and parse the JSON file into an RDF graph
    
    graph = ec.inference.catalyst_graph_for(source)
    
    # 2. extract the users, nodes, comments from the graph
    if kind == 'simple':
        users,nodes,comments = ec.extract.simple.users_nodes_comments_from(graph)
    elif kind == 'excerpts':
        users,nodes,comments = ec.extract.excerpts.users_nodes_comments_from(graph)
    else:
        logging.info("Parsing catalyst - Extraction kind not supported")
        return
        
    # 3. sort the lists
    sorted_users = sorted(users, key=eu.sort_by('created'))
    sorted_nodes = sorted(nodes, key=eu.sort_by('created'))
    sorted_comments = sorted(comments, key=eu.sort_by('created'))
    
    # 4. save the files
    write_file(sorted_users, 'users.json', outdir)
    write_file(sorted_nodes, 'nodes.json', outdir)
    write_file(sorted_comments, 'comments.json', outdir)
    logging.info("Parsing catalyst - Completed")
Example #2
def convert_to_network(generated, graph, posts, creator_of_post, reply_of, moderator_test=None):
    all_creators = {creator_of_post.get(n, None) for n in posts}
    all_creators.discard(None)
    profile_of_account = {}
    nodes = [account_as_node(graph, account, profile_of_account, moderator_test) for account in all_creators]
    edges = []
    for post in posts:
        for i, replying in enumerate(reply_of.get(post, ())):
            post_id = stringify(post)
            if i:
                post_id = "%s__%d" % (post_id, i)
            edges.append(
                post_as_link(
                    graph,
                    post,
                    post_id,
                    replying,
                    profile_of_account[creator_of_post[post]],
                    profile_of_account[creator_of_post[replying]],
                    moderator_test,
                )
            )

    nodes.sort(key=eu.sort_by("created_ts"))
    edges.sort(key=eu.sort_by("ts"))

    # this is the network object
    # going forward it should be read from a serialized format to handle caching
    return {"meta": {"generated": int(generated.strftime("%s"))}, "edges": edges, "nodes": nodes}
Example #3
def parse_cif(source, kind):
    logging.info("parse_source - Started")
    logging.info("parse_source - Source: %(s)s" % {'s': source})
    logging.info("parse_source - Extraction Kind: %(s)s" % {'s': kind})

    # 1. load and parse the JSON file into an RDF graph
    graph = ec.inference.catalyst_graph_for(source)

    # 2. extract the users, nodes, comments from the graph
    if kind == 'simple':
        users, nodes, comments = ec.extract.simple.users_nodes_comments_from(
            graph)
    elif kind == 'excerpts':
        users, nodes, comments = ec.extract.excerpts.users_nodes_comments_from(
            graph)
    else:
        logging.info("Parsing catalyst - Extraction kind not supported")
        return

    # 3. sort the lists
    sorted_users = sorted(users, key=eu.sort_by('created'))
    sorted_nodes = sorted(nodes, key=eu.sort_by('created'))
    sorted_comments = sorted(comments, key=eu.sort_by('created'))

    # 4. return the data
    logging.info("Parsing catalyst - Completed")
    return (sorted_users, sorted_nodes, sorted_comments)
Example #4
def main():
    initialize_logger('./log')

    generated = datetime.now()
    sources, outdir, moderators, charset, force_name_as_uid, debug = parse_options(
        sys.argv[1:])
    logging.info("Parsing mailinglist - Started")
    logging.info("Parsing mailinglist - Source files: %(s)s" %
                 {'s': repr(sources)})
    logging.info("Parsing mailinglist - Output directory: %(s)s" %
                 {'s': outdir})

    # 1. load and parse each file into a list of messages
    logging.info("Parsing mailinglist - Reading the files")
    messages = []
    for file in sources:
        mbox = mailbox.mbox(file)
        for msg in mbox:
            messages.append(emt.Message(msg))

    # 2. build the threaded containers
    logging.info("Parsing mailinglist - Threading the messages")
    subject_table = emt.thread(messages)
    root_containers = [ctr for (subj, ctr) in subject_table.items()]
    containers = emp.promote_none_root_set_children(root_containers)

    if force_name_as_uid:
        emp.force_name_as_address(containers)

    # Debug
    if debug:
        print('==== Message threads ====')
        for container in containers:
            emp.print_container(container)
        print('=========================')

    # 3. extract the users, nodes, comments and sort them
    logging.info("Parsing mailinglist - Extracting the data")
    users, nodes, comments = emp.users_nodes_comments_from(
        containers, moderators, charset)
    sorted_users = sorted(users, key=eu.sort_by('created'))
    sorted_nodes = sorted(nodes, key=eu.sort_by('created'))
    sorted_comments = sorted(comments, key=eu.sort_by('created'))

    # 4. save the files
    logging.info("Parsing mailinglist - Saving the files")
    write_file(sorted_users, 'users.json', outdir)
    write_file(sorted_nodes, 'nodes.json', outdir)
    write_file(sorted_comments, 'comments.json', outdir)

    logging.info("Parsing mailinglist - Completed")
Example #5
def extract_multiauthor_post_edges(nodes_map, posts_map):
    # build the list of edges
    edges_list = []
    # a post is 'multiauthor' if it lists more than one author
    multiauthor_posts = [e for e in posts_map.values() if e.get('all_authors', None) and hasattr(e.get('all_authors', None), '__iter__') and len(e.get('all_authors', None))>1]
    logging.info("%(v)i multiauthor posts on %(t)i total" % {'v':len(multiauthor_posts), 't':len(posts_map.values())})
    
    # build the posts network to use for metrics
    for post in multiauthor_posts:
        for authors in itertools.product(post['all_authors'], post['all_authors']):
            if authors[0]!=authors[1]:
                link = {
                    'id': "{0}_{1}_{2}".format(authors[0],authors[1],post['created_ts']),
                    'source': authors[0],
                    'target': authors[1],
                    'ts': post['created_ts'],
                    'effort': post['length'],
                    'team': post['team']
                }
                if authors[0] in nodes_map:
                    nodes_map[authors[0]]['active'] = True
                else:
                    logging.info("error: node %(n)s was linked but not found in the nodes_map" % {'n':authors[0]})  
        
                if authors[1] in nodes_map:
                    nodes_map[authors[1]]['active'] = True
                else:
                    logging.info("error: node %(n)s was linked but not found in the nodes_map" % {'n':authors[1]})  
                edges_list.append(link)

    return sorted(edges_list, key=eu.sort_by('ts'))
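
The itertools.product loop plus the authors[0] != authors[1] guard enumerates every ordered pair of distinct co-authors. itertools.permutations produces exactly those pairs directly; a small self-contained sketch of the equivalence (example author ids, and assuming all_authors contains no duplicate ids):

import itertools

all_authors = ['alice', 'bob', 'carol']   # stands in for post['all_authors']
pairs = list(itertools.permutations(all_authors, 2))
# [('alice', 'bob'), ('alice', 'carol'), ('bob', 'alice'), ('bob', 'carol'),
#  ('carol', 'alice'), ('carol', 'bob')]
# the same ordered pairs that product() plus the inequality check yields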
Example #6
def extract_edges(nodes_map, comments_map):
    # build the list of edges
    edges_list = []
    # a comment is 'valid' if it has a recipient and an author
    valid_comments = [e for e in comments_map.values() if e.get('recipient_id', None) and e.get('author_id', None)]
    logging.info("%(v)i valid comments on %(t)i total" % {'v':len(valid_comments), 't':len(comments_map.values())})
    
    # build the comments network to use for metrics
    for comment in valid_comments:
        if comment['author_id'] in nodes_map:
            nodes_map[comment['author_id']]['active'] = True
        else:
            logging.info("error: node %(n)s was linked but not found in the nodes_map" % {'n':comment['author_id']})  
        
        if comment.get('post_all_authors', None) and hasattr(comment['post_all_authors'], '__iter__'):
            links = [make_edge(comment, recipient) for recipient in comment['post_all_authors']]
        else:
            links = [make_edge(comment, comment['recipient_id'])]
        
        for link in links:
            if link['target'] in nodes_map:
                nodes_map[link['target']]['active'] = True
            else:
                logging.info("error: node %(n)s was linked but not found in the nodes_map" % {'n':link['target']})  
            edges_list.append(link)


    return sorted(edges_list, key=eu.sort_by('ts'))
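
This example delegates edge construction to a make_edge(comment, recipient) helper that is not shown. Judging from the inline link dict built in Example #7 below, it most likely assembles the same structure for a given recipient; a hedged reconstruction:

def make_edge(comment, recipient):
    # hypothetical reconstruction based on the inline link dict in Example #7
    return {
        'id': "{0}_{1}_{2}".format(comment['author_id'], recipient, comment['created_ts']),
        'source': comment['author_id'],
        'target': recipient,
        'ts': comment['created_ts'],
        'effort': comment['length'],
        'team': comment['team']
    }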
Example #7
def extract_edges(nodes_map, comments_map):
    # build the list of edges
    edges_list = []
    # a comment is 'valid' if it has a recipient and an author
    valid_comments = [e for e in comments_map.values() if e.get('recipient_id', None) and e.get('author_id', None)]
    logging.info("%(v)i valid comments on %(t)i total" % {'v':len(valid_comments), 't':len(comments_map.values())})
    
    # build the whole network to use for metrics
    for comment in valid_comments:
        link = {
            'id': "{0}_{1}_{2}".format(comment['author_id'],comment['recipient_id'],comment['created_ts']),
            'source': comment['author_id'],
            'target': comment['recipient_id'],
            'ts': comment['created_ts'],
            'effort': comment['length'],
            'team': comment['team']
        }
        if comment['author_id'] in nodes_map:
            nodes_map[comment['author_id']]['active'] = True
        else:
            logging.info("error: node %(n)s was linked but not found in the nodes_map" % {'n':comment['author_id']})  
    
        if comment['recipient_id'] in nodes_map:
            nodes_map[comment['recipient_id']]['active'] = True
        else:
            logging.info("error: node %(n)s was linked but not found in the nodes_map" % {'n':comment['recipient_id']})  
        edges_list.append(link)


    return sorted(edges_list, key=eu.sort_by('ts'))
Example #10
def main():
    initialize_logger('./log')

    generated = datetime.now()
    source_csv, source_dir, outdir, dumpto = parse_options(sys.argv[1:])
    logging.info("Parsing tweets - Started")
    logging.info("Parsing tweets - Output directory: %(s)s" % {'s': outdir})

    # 1. load and parse the CSV file into a list of records
    if dumpto:
        tag = datetime.now().strftime('%Y-%m-%d-%H-%M-%S') + '.csv'
        dump_to = os.path.join(dumpto, tag)
    else:
        dump_to = None

    tweets = []
    if source_csv:
        tweets += et.parse.load_and_parse_csv(source_csv,
                                              sort_key='created_ts',
                                              dump_to=dump_to)

    if source_dir:
        tweets += et.parse.load_and_parse_from_dir(source_dir,
                                                   sort_key='created_ts',
                                                   dump_to=dump_to)

    # 2. extract the users from the tweets
    users = et.extract.extract_users(tweets)
    sorted_users = sorted(users, key=eu.sort_by('created'))
    # users = { 'users': [{'user': user_data} for user_data in users] }

    # 3. extract the nodes from the tweets
    nodes = et.extract.extract_nodes(tweets)
    sorted_nodes = sorted(nodes, key=eu.sort_by('created'))
    # nodes = { 'nodes': [{'node': node_data} for node_data in nodes] }

    # 4. extract the comments from the tweets
    comments = et.extract.extract_comments(tweets)
    sorted_comments = sorted(comments, key=eu.sort_by('created'))
    # comments = { 'comments': [{'comment': comment_data} for comment_data in comments] }

    # 5. save the files
    write_file(tweets, 'tweets.json', outdir)
    write_file(sorted_users, 'users.json', outdir)
    write_file(sorted_nodes, 'nodes.json', outdir)
    write_file(sorted_comments, 'comments.json', outdir)
    logging.info("Parsing tweets - Completed")
Example #11
def main():
    initialize_logger("./log")

    generated = datetime.now()
    sources, outdir, moderators, charset, force_name_as_uid, debug = parse_options(sys.argv[1:])
    logging.info("Parsing mailinglist - Started")
    logging.info("Parsing mailinglist - Source files: %(s)s" % {"s": repr(sources)})
    logging.info("Parsing mailinglist - Output directory: %(s)s" % {"s": outdir})

    # 1. load and parse each file into a list of messages
    logging.info("Parsing mailinglist - Reading the files")
    messages = []
    for file in sources:
        mbox = mailbox.mbox(file)
        for msg in mbox:
            messages.append(emt.Message(msg))

    # 2. build the threaded containers
    logging.info("Parsing mailinglist - Threading the messages")
    subject_table = emt.thread(messages)
    root_containers = [ctr for (subj, ctr) in subject_table.items()]
    containers = emp.promote_none_root_set_children(root_containers)

    if force_name_as_uid:
        emp.force_name_as_address(containers)

    # Debug
    if debug:
        print("==== Message threads ====")
        for container in containers:
            emp.print_container(container)
        print("=========================")

    # 3. extract the users, nodes, comments and sort them
    logging.info("Parsing mailinglist - Extracting the data")
    users, nodes, comments = emp.users_nodes_comments_from(containers, moderators, charset)
    sorted_users = sorted(users, key=eu.sort_by("created"))
    sorted_nodes = sorted(nodes, key=eu.sort_by("created"))
    sorted_comments = sorted(comments, key=eu.sort_by("created"))

    # 4. save the files
    logging.info("Parsing mailinglist - Saving the files")
    write_file(sorted_users, "users.json", outdir)
    write_file(sorted_nodes, "nodes.json", outdir)
    write_file(sorted_comments, "comments.json", outdir)

    logging.info("Parsing mailinglist - Completed")
Example #12
def compute_all_metrics(nodes_map, posts_map, comments_map, network, timesteps_range, timestep, timestep_window):
    metrics = {}
    
    # calculate the network metrics
    for ts in timesteps_range:
        metrics[ts] = metrics_for_ts(nodes_map, posts_map, comments_map, network, ts, timestep, timestep_window)
        
    return sorted(metrics.values(), key=sort_by('ts'))
Example #13
def compute_all_metrics(nodes_map, posts_map, comments_map, network, timesteps_range, timestep, timestep_window):
    metrics = {}
    
    # calculate the network metrics
    for ts in timesteps_range:
        metrics[ts] = metrics_for_ts(nodes_map, posts_map, comments_map, network, ts, timestep, timestep_window)
        
    return sorted([m for m in metrics.values() if m is not None], key=sort_by('ts'))
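
The only difference from Example #12 is that timesteps for which metrics_for_ts returned None are dropped before sorting. Neither example shows how the timestep arguments are built; a hedged usage sketch, assuming Unix-timestamp boundaries with an hourly step and a three-hour sliding window (all values hypothetical; nodes_map, posts_map, comments_map and network come from the extraction steps shown elsewhere):

timestep = 60 * 60                              # one hour, in seconds
timestep_window = 3 * timestep                  # look back three hours per step
start_ts, end_ts = 1388534400, 1388620800       # assumed epoch bounds
timesteps_range = range(start_ts, end_ts + 1, timestep)

metrics = compute_all_metrics(nodes_map, posts_map, comments_map,
                              network, timesteps_range, timestep, timestep_window)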
Example #14
def main():
    initialize_logger('./log')
    
    generated = datetime.now()
    source_csv, source_dir, outdir, dumpto = parse_options(sys.argv[1:])
    logging.info("Parsing tweets - Started")
    logging.info("Parsing tweets - Output directory: %(s)s" % {'s':outdir})
    
    # 1. load and parse the CSV file into a list of records
    if dumpto:
        tag = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')+'.csv'
        dump_to = os.path.join(dumpto, tag)
    else:
        dump_to = None
    
    tweets = []
    if source_csv:
        tweets += et.parse.load_and_parse_csv(source_csv, sort_key='created_ts', dump_to=dump_to)

    if source_dir:
        tweets += et.parse.load_and_parse_from_dir(source_dir, sort_key='created_ts', dump_to=dump_to)
        
    # 2. extract the users from the tweets
    users = et.extract.extract_users(tweets)
    sorted_users = sorted(users, key=eu.sort_by('created'))
    # users = { 'users': [{'user': user_data} for user_data in users] }

    # 3. extract the nodes from the tweets
    nodes = et.extract.extract_nodes(tweets)
    sorted_nodes = sorted(nodes, key=eu.sort_by('created'))
    # nodes = { 'nodes': [{'node': node_data} for node_data in nodes] }
    
    # 4. extract the comments from the tweets
    comments = et.extract.extract_comments(tweets)
    sorted_comments = sorted(comments, key=eu.sort_by('created'))
    # comments = { 'comments': [{'comment': comment_data} for comment_data in comments] }
    
    # 5. save the files
    write_file(tweets, 'tweets.json', outdir)
    write_file(sorted_users, 'users.json', outdir)
    write_file(sorted_nodes, 'nodes.json', outdir)
    write_file(sorted_comments, 'comments.json', outdir)
    logging.info("Parsing tweets - Completed")
Example #15
def load_and_parse_csv(source, sort_key=None, dump_to=None):
    raw_tweets = eu.resource.load_csv(source, dump_to=dump_to)
    tweets = [map_csv_data(t) for t in raw_tweets]
    tweets = [t for t in tweets if t]
    logging.info(
        "Parsing tweets - read %(t)i tweets in CSV, using %(v)i valid tweets" %
        {
            't': len(raw_tweets),
            'v': len(tweets)
        })
    return sorted(tweets, key=eu.sort_by(sort_key))
Example #16
def load_and_parse_from_dir(source, sort_key=None, dump_to=None):
    # read each JSON dump, closing the file handles as we go
    raw_tweets = []
    for f in glob.glob(os.path.join(source, "*.json")):
        with open(f, 'r') as fh:
            raw_tweets.append(json.load(fh))
    tweets = [map_json_data(t) for t in raw_tweets]
    tweets = [t for t in tweets if t]
    logging.info(
        "Parsing tweets - read %(t)i tweets from JSON, using %(v)i valid tweets"
        % {
            't': len(raw_tweets),
            'v': len(tweets)
        })
    return sorted(tweets, key=eu.sort_by(sort_key))
Example #17
def convert_to_network(generated,
                       graph,
                       posts,
                       creator_of_post,
                       reply_of,
                       moderator_test=None):
    all_creators = {creator_of_post.get(n, None) for n in posts}
    all_creators.discard(None)
    profile_of_account = {}
    nodes = [
        account_as_node(graph, account, profile_of_account, moderator_test)
        for account in all_creators
    ]
    edges = []
    for post in posts:
        for i, replying in enumerate(reply_of.get(post, ())):
            post_id = stringify(post)
            if i:
                post_id = '%s__%d' % (post_id, i)
            edges.append(
                post_as_link(graph, post, post_id, replying,
                             profile_of_account[creator_of_post[post]],
                             profile_of_account[creator_of_post[replying]],
                             moderator_test))

    nodes.sort(key=eu.sort_by('created_ts'))
    edges.sort(key=eu.sort_by('ts'))

    # this is the network object
    # going forward it should be read from a serialized format to handle caching
    return {
        'meta': {
            'generated': int(generated.strftime("%s"))
        },
        'edges': edges,
        'nodes': nodes
    }
Example #18
def load_and_parse_from_dir(source, sort_key=None, dump_to=None):
    # read each JSON dump, closing the file handles as we go
    raw_tweets = []
    for f in glob.glob(os.path.join(source, "*.json")):
        with open(f, 'r') as fh:
            raw_tweets.append(json.load(fh))
    tweets = [map_json_data(t) for t in raw_tweets]
    tweets = [t for t in tweets if t]
    logging.info("Parsing tweets - read %(t)i tweets from JSON, using %(v)i valid tweets" % {'t': len(raw_tweets), 'v': len(tweets)})
    return sorted(tweets, key=eu.sort_by(sort_key))
Example #19
def load_and_parse_csv(source, sort_key=None, dump_to=None):
    raw_tweets = eu.resource.load_csv(source, dump_to=dump_to)
    tweets = [map_csv_data(t) for t in raw_tweets]
    tweets = [t for t in tweets if t]
    logging.info("Parsing tweets - read %(t)i tweets in CSV, using %(v)i valid tweets" % {'t': len(raw_tweets), 'v': len(tweets)})
    return sorted(tweets, key=eu.sort_by(sort_key))
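
Both loaders keep only truthy results of the mapping step (tweets = [t for t in tweets if t]), which implies map_csv_data / map_json_data return None for rows they cannot interpret. A hedged sketch of that contract, with hypothetical field names:

def map_csv_data(row):
    # hypothetical sketch: normalize one CSV row into the tweet dict the
    # pipeline expects, or return None so the caller's filter drops it
    if not row.get('id') or not row.get('created_ts'):
        return None
    return {
        'id': row['id'],
        'text': row.get('text', ''),
        'created_ts': row['created_ts']
    }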