def hashTags(id):
    """Export the ten most common hashtags for a Twitter search target."""
    tags = []
    export_uuid = uuid.uuid4()
    if not os.path.isdir(EXPORTS_BASEDIR):
        os.makedirs(EXPORTS_BASEDIR)
    q = models.TWITTER.query.filter(models.TWITTER.row_id == id).first()
    archive_dir = os.path.join(ARCHIVE_BASEDIR, q.targetType, q.title[0], q.title)
    with open(os.path.join(EXPORTS_BASEDIR,
                           'hashtags_{}.txt'.format(export_uuid)), 'w+') as f:
        for filename in os.listdir(archive_dir):
            if not filename.endswith(".gz"):
                continue
            try:
                for line in gzip.open(os.path.join(archive_dir, filename)):
                    tweet = json.loads(line.decode('utf-8'))
                    for tag in tweet['entities']['hashtags']:
                        tags.append(tag['text'].lower())
            except (OSError, ValueError, KeyError):
                # Skip truncated archive files and malformed tweets.
                continue
        counts = Counter(tags)
        for t in counts.most_common(10):
            f.write(str(t))
            f.write('\n')
    addExportRef = models.EXPORTS(url='hashtags_{}.txt'.format(export_uuid),
                                  type='Hashtags',
                                  exported=datetime.utcnow(),
                                  count=None)
    q.exports.append(addExportRef)
    db.session.commit()
    db.session.close()
def dehydrateUserSearch(id):
    """Export the tweet ids (a "dehydrated" archive) for a Twitter search target."""
    count = 0
    export_uuid = uuid.uuid4()
    if not os.path.isdir(EXPORTS_BASEDIR):
        os.makedirs(EXPORTS_BASEDIR)
    q = models.TWITTER.query.filter(models.TWITTER.row_id == id).first()
    archive_dir = os.path.join(ARCHIVE_BASEDIR, q.targetType, q.title[0], q.title)
    with open(os.path.join(EXPORTS_BASEDIR,
                           'dehydrate_{}.txt'.format(export_uuid)), 'w+') as f:
        for filename in os.listdir(archive_dir):
            if not filename.endswith(".gz"):
                continue
            for line in gzip.open(os.path.join(archive_dir, filename)):
                try:
                    tweet_id = json.loads(line.decode('utf-8'))['id_str']
                except (ValueError, KeyError):
                    # Skip lines that are not valid tweet JSON.
                    continue
                # Only count ids that were actually written out.
                count += 1
                f.write(tweet_id)
                f.write('\n')
    addExportRef = models.EXPORTS(url='dehydrate_{}.txt'.format(export_uuid),
                                  type='Dehydrate',
                                  exported=datetime.utcnow(),
                                  count=count)
    q.exports.append(addExportRef)
    db.session.commit()
    db.session.close()
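# Sketch of the reverse operation (not wired into the app): twarc's hydrate()
# turns a file of tweet ids, one per line, back into full tweet JSON. The
# function name and output path are illustrative assumptions; the credential
# handling mirrors Followers() below.
def rehydrate(ids_path, out_path):
    CREDENTIALS = models.CREDENTIALS.query.one()
    t = twarc.Twarc(consumer_key=CREDENTIALS.consumer_key,
                    consumer_secret=CREDENTIALS.consumer_secret,
                    access_token=CREDENTIALS.access_token,
                    access_token_secret=CREDENTIALS.access_secret)
    with open(ids_path) as ids, open(out_path, 'w') as out:
        # hydrate() batches the ids through the statuses/lookup endpoint
        # and yields full tweet dicts.
        for tweet in t.hydrate(line.strip() for line in ids):
            out.write(json.dumps(tweet))
            out.write('\n')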
def hashTagsCollection(id):
    """Export the ten most common hashtags across a collection's targets,
    limited to the collection's inclusive date range."""
    tags = []
    export_uuid = uuid.uuid4()
    if not os.path.isdir(EXPORTS_BASEDIR):
        os.makedirs(EXPORTS_BASEDIR)
    q = models.COLLECTION.query.filter(models.COLLECTION.row_id == id).first()
    linkedTargets = q.tags
    dbDateStart = q.inclDateStart
    dbDateStop = q.inclDateEnd
    with open(os.path.join(EXPORTS_BASEDIR,
                           'hashtags_{}.txt'.format(export_uuid)), 'w+') as f:
        for target in linkedTargets:
            print(target.title)
            archive_dir = os.path.join(ARCHIVE_BASEDIR, target.targetType,
                                       target.title[0], target.title)
            try:
                for filename in os.listdir(archive_dir):
                    if not filename.endswith(".gz"):
                        continue
                    for line in gzip.open(os.path.join(archive_dir, filename)):
                        tweet = json.loads(line.decode('utf-8'))
                        tweetDate = datetime.strptime(
                            tweet['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
                        if dbDateStart < tweetDate < dbDateStop:
                            for tag in tweet['entities']['hashtags']:
                                tags.append(tag['text'].lower())
            except (OSError, ValueError, KeyError):
                # Skip targets without an archive yet, plus malformed tweets.
                continue
        counts = Counter(tags)
        for t in counts.most_common(10):
            f.write(str(t))
            f.write('\n')
    addExportRef = models.EXPORTS(url='hashtags_{}.txt'.format(export_uuid),
                                  type='Hashtags',
                                  exported=datetime.utcnow(),
                                  count=None)
    q.exports.append(addExportRef)
    db.session.commit()
    db.session.close()
def dehydrateCollection(id):
    """Export the tweet ids for every target in a collection, limited to the
    collection's inclusive date range."""
    count = 0
    export_uuid = uuid.uuid4()
    if not os.path.isdir(EXPORTS_BASEDIR):
        os.makedirs(EXPORTS_BASEDIR)
    q = models.COLLECTION.query.filter(models.COLLECTION.row_id == id).first()
    linkedTargets = q.tags
    dbDateStart = q.inclDateStart
    dbDateStop = q.inclDateEnd
    with open(os.path.join(EXPORTS_BASEDIR,
                           'dehydrate_{}.txt'.format(export_uuid)), 'w+') as f:
        for target in linkedTargets:
            print(target.title)
            archive_dir = os.path.join(ARCHIVE_BASEDIR, target.targetType,
                                       target.title[0], target.title)
            for filename in os.listdir(archive_dir):
                if not filename.endswith(".gz"):
                    continue
                for line in gzip.open(os.path.join(archive_dir, filename)):
                    try:
                        tweet = json.loads(line.decode('utf-8'))
                        tweetDate = datetime.strptime(
                            tweet['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
                    except (ValueError, KeyError):
                        continue
                    if dbDateStart < tweetDate < dbDateStop:
                        count += 1
                        f.write(tweet['id_str'])
                        f.write('\n')
    addExportRef = models.EXPORTS(url='dehydrate_{}.txt'.format(export_uuid),
                                  type='Dehydrate',
                                  exported=datetime.utcnow(),
                                  count=count)
    q.exports.append(addExportRef)
    db.session.commit()
    db.session.close()
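# The four export functions above all walk the same on-disk layout:
# ARCHIVE_BASEDIR/<targetType>/<first letter of title>/<title>/*.gz, one
# JSON tweet per line. A shared helper could factor that out; this is a
# sketch under that assumption, not something the functions above call.
def iterArchivedTweets(targetType, title):
    """Hypothetical helper: yield parsed tweets from a target's archive,
    skipping unreadable files and malformed lines."""
    archive_dir = os.path.join(ARCHIVE_BASEDIR, targetType, title[0], title)
    if not os.path.isdir(archive_dir):
        return
    for filename in os.listdir(archive_dir):
        if not filename.endswith('.gz'):
            continue
        try:
            with gzip.open(os.path.join(archive_dir, filename)) as gz:
                for line in gz:
                    try:
                        yield json.loads(line.decode('utf-8'))
                    except ValueError:
                        continue
        except OSError:
            # Truncated or unreadable archive file; move on.
            continue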
def bagger(id):
    """Package the raw archive for a Twitter target as a zipped BagIt bag."""
    q = models.TWITTER.query.filter(models.TWITTER.row_id == id).first()
    dest = os.path.join(EXPORTS_BASEDIR, q.title)
    shutil.copytree(
        os.path.join(ARCHIVE_BASEDIR, q.targetType, q.title[0], q.title), dest)
    bagit.make_bag(dest, {
        'target-type': q.targetType,
        'title': q.title,
        'search-string': q.searchString,
        'search-language': q.searchLang,
        'description': q.description,
        'keywords': q.subject
    })
    # make_archive(base_name, format, root_dir) writes <base_name>.zip from
    # the bagged staging copy, which is then removed.
    make_archive(os.path.join(EXPORTS_BASEDIR, q.title), 'zip', dest)
    shutil.rmtree(dest)
    addExportRef = models.EXPORTS(url='{}.zip'.format(q.title),
                                  type='Bag',
                                  exported=datetime.utcnow(),
                                  count=None)
    q.exports.append(addExportRef)
    db.session.commit()
    db.session.close()
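# A bag written by bagger() can be verified after unpacking the zip; bagit's
# Bag class recomputes the payload checksums. A minimal sketch, assuming an
# already-unpacked bag directory (the function name is an assumption):
def validateBag(bag_dir):
    """Hypothetical check, not called by the app: verify bag fixity."""
    bag = bagit.Bag(bag_dir)
    return bag.is_valid()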
def Followers(id):
    """Export the current follower ids of a Twitter user target via the API."""
    CREDENTIALS = models.CREDENTIALS.query.one()
    with app.test_request_context():
        t = twarc.Twarc(consumer_key=CREDENTIALS.consumer_key,
                        consumer_secret=CREDENTIALS.consumer_secret,
                        access_token=CREDENTIALS.access_token,
                        access_token_secret=CREDENTIALS.access_secret)
        count = 0
        export_uuid = uuid.uuid4()
        if not os.path.isdir(EXPORTS_BASEDIR):
            os.makedirs(EXPORTS_BASEDIR)
        q = models.TWITTER.query.filter(models.TWITTER.row_id == id).first()
        follower_ids = t.follower_ids(q.title)
        with open(os.path.join(EXPORTS_BASEDIR,
                               'followers_{}.txt'.format(export_uuid)), 'w+') as f:
            for u in follower_ids:
                count += 1
                f.write(str(u))
                f.write('\n')
        addExportRef = models.EXPORTS(url='followers_{}.txt'.format(export_uuid),
                                      type='Followers',
                                      exported=datetime.utcnow(),
                                      count=count)
        q.exports.append(addExportRef)
        db.session.commit()
        db.session.close()
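# The follower export stores bare user ids. A sketch of expanding them into
# full user profiles with twarc's user_lookup(), which resolves ids via the
# users/lookup endpoint; the function name is an assumption and nothing in
# the app calls it.
def followerProfiles(ids):
    CREDENTIALS = models.CREDENTIALS.query.one()
    t = twarc.Twarc(consumer_key=CREDENTIALS.consumer_key,
                    consumer_secret=CREDENTIALS.consumer_secret,
                    access_token=CREDENTIALS.access_token,
                    access_token_secret=CREDENTIALS.access_secret)
    # Yields one user dict per resolved id.
    for user in t.user_lookup(ids):
        yield user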
def networkUserSearch(id, users, retweets, min_subgraph_size, max_subgraph_size, output):
    """Export the reply/quote/retweet network of a Twitter search target as
    gexf, json, or a self-contained D3 html page."""
    G = networkx.Graph()

    def add(from_user, from_id, to_user, to_id, type):
        "adds a relation to the graph"
        if users and to_user:
            G.add_node(from_user, screen_name=from_user)
            G.add_node(to_user, screen_name=to_user)
            G.add_edge(from_user, to_user, type=type)
        elif not users and to_id:
            G.add_node(from_id, screen_name=from_user, type=type)
            if to_user:
                G.add_node(to_id, screen_name=to_user)
            else:
                G.add_node(to_id)
            G.add_edge(from_id, to_id, type=type)

    def to_json(g):
        j = {"nodes": [], "links": []}
        for node_id, node_attrs in g.nodes(data=True):
            j["nodes"].append({
                "id": node_id,
                "type": node_attrs.get("type"),
                "screen_name": node_attrs.get("screen_name")
            })
        for source, target, attrs in g.edges(data=True):
            j["links"].append({
                "source": source,
                "target": target,
                "type": attrs.get("type")
            })
        return j

    count = 0
    export_uuid = uuid.uuid4()
    if not os.path.isdir(EXPORTS_BASEDIR):
        os.makedirs(EXPORTS_BASEDIR)
    q = models.TWITTER.query.filter(models.TWITTER.row_id == id).first()
    archive_dir = os.path.join(ARCHIVE_BASEDIR, q.targetType, q.title[0], q.title)
    for filename in os.listdir(archive_dir):
        if not filename.endswith(".gz"):
            continue
        for line in gzip.open(os.path.join(archive_dir, filename)):
            try:
                t = json.loads(line.decode('utf-8'))
            except ValueError:
                continue
            from_id = t['id_str']
            from_user = t['user']['screen_name']
            to_user = None
            to_id = None
            count += 1
            if users:
                # User-level graph: edges between screen names via mentions.
                for u in t['entities'].get('user_mentions', []):
                    add(from_user, from_id, u['screen_name'], None, 'reply')
            else:
                # Tweet-level graph: edges for replies, quotes and retweets.
                if t.get('in_reply_to_status_id_str'):
                    to_id = t['in_reply_to_status_id_str']
                    to_user = t['in_reply_to_screen_name']
                    add(from_user, from_id, to_user, to_id, "reply")
                if t.get('quoted_status'):
                    to_id = t['quoted_status']['id_str']
                    to_user = t['quoted_status']['user']['screen_name']
                    add(from_user, from_id, to_user, to_id, "quote")
                if retweets and t.get('retweeted_status'):
                    to_id = t['retweeted_status']['id_str']
                    to_user = t['retweeted_status']['user']['screen_name']
                    add(from_user, from_id, to_user, to_id, "retweet")

    if min_subgraph_size or max_subgraph_size:
        # connected_component_subgraphs() is gone as of networkx 2.4; build
        # each component's subgraph from connected_components() instead.
        g_copy = G.copy()
        for component in networkx.connected_components(G):
            g = G.subgraph(component)
            if min_subgraph_size and len(g) < min_subgraph_size:
                g_copy.remove_nodes_from(g.nodes())
            elif max_subgraph_size and len(g) > max_subgraph_size:
                g_copy.remove_nodes_from(g.nodes())
        G = g_copy

    if output == 'gexf':
        networkx.write_gexf(
            G, os.path.join(EXPORTS_BASEDIR, 'gexf_{}.gexf'.format(export_uuid)))
        addExportRef = models.EXPORTS(url='gexf_{}.gexf'.format(export_uuid),
                                      type='Network(gexf)',
                                      exported=datetime.utcnow(),
                                      count=count)
    elif output == 'json':
        with open(os.path.join(EXPORTS_BASEDIR,
                               'json_{}.json'.format(export_uuid)), 'w') as f:
            json.dump(to_json(G), f, indent=2)
        addExportRef = models.EXPORTS(url='json_{}.json'.format(export_uuid),
                                      type='Network(json)',
                                      exported=datetime.utcnow(),
                                      count=count)
    elif output == 'html':
        graph_data = json.dumps(to_json(G), indent=2)
        html = """<!DOCTYPE html>
<meta charset="utf-8">
<script src="https://platform.twitter.com/widgets.js"></script>
<script src="https://d3js.org/d3.v4.min.js"></script>
<script src="https://code.jquery.com/jquery-3.1.1.min.js"></script>
<style>
.links line {
  stroke: #999;
  stroke-opacity: 0.8;
  stroke-width: 2px;
}
line.reply {
  stroke: #999;
}
line.retweet {
  stroke-dasharray: 5;
}
line.quote {
  stroke-dasharray: 5;
}
.nodes circle {
  stroke: red;
  fill: red;
  stroke-width: 1.5px;
}
circle.retweet {
  fill: white;
  stroke: #999;
}
circle.reply {
  fill: #999;
  stroke: #999;
}
circle.quote {
  fill: yellow;
  stroke: yellow;
}
#graph {
  width: 99vw;
  height: 99vh;
}
#tweet {
  position: absolute;
  left: 100px;
  top: 150px;
}
</style>
<svg id="graph"></svg>
<div id="tweet"></div>
<script>
var width = $(window).width();
var height = $(window).height();

var svg = d3.select("svg")
    .attr("height", height)
    .attr("width", width);

var color = d3.scaleOrdinal(d3.schemeCategory20c);

var simulation = d3.forceSimulation()
    .velocityDecay(0.6)
    .force("link", d3.forceLink().id(function(d) { return d.id; }))
    .force("charge", d3.forceManyBody())
    .force("center", d3.forceCenter(width / 2, height / 2));

var graph = %s;

var link = svg.append("g")
    .attr("class", "links")
    .selectAll("line")
    .data(graph.links)
    .enter().append("line")
    .attr("class", function(d) { return d.type; });

var node = svg.append("g")
    .attr("class", "nodes")
    .selectAll("circle")
    .data(graph.nodes)
    .enter().append("circle")
    .attr("r", 5)
    .attr("class", function(d) { return d.type; })
    .call(d3.drag()
        .on("start", dragstarted)
        .on("drag", dragged)
        .on("end", dragended));

node.append("title")
    .text(function(d) { return d.id; });

node.on("click", function(d) {
  $("#tweet").empty();

  var rect = this.getBoundingClientRect();
  var paneHeight = d.type == "retweet" ? 50 : 200;
  var paneWidth = d.type == "retweet" ? 75 : 500;

  var left = rect.x - paneWidth / 2;
  if (rect.y > height / 2) {
    var top = rect.y - paneHeight;
  } else {
    var top = rect.y + 10;
  }

  var tweet = $("#tweet");
  tweet.css({left: left, top: top});

  if (d.type == "retweet") {
    twttr.widgets.createFollowButton(d.screen_name, tweet[0], {size: "large"});
  } else {
    twttr.widgets.createTweet(d.id, tweet[0], {conversation: "none"});
  }

  d3.event.stopPropagation();
});

svg.on("click", function(d) {
  $("#tweet").empty();
});

simulation
    .nodes(graph.nodes)
    .on("tick", ticked);

simulation.force("link")
    .links(graph.links);

function ticked() {
  link
      .attr("x1", function(d) { return d.source.x; })
      .attr("y1", function(d) { return d.source.y; })
      .attr("x2", function(d) { return d.target.x; })
      .attr("y2", function(d) { return d.target.y; });

  node
      .attr("cx", function(d) { return d.x; })
      .attr("cy", function(d) { return d.y; });
}

function dragstarted(d) {
  if (!d3.event.active) simulation.alphaTarget(0.3).restart();
  d.fx = d.x;
  d.fy = d.y;
}

function dragged(d) {
  d.fx = d3.event.x;
  d.fy = d3.event.y;
}

function dragended(d) {
  if (!d3.event.active) simulation.alphaTarget(0);
  d.fx = null;
  d.fy = null;
}
</script>
""" % graph_data
        with open(os.path.join(EXPORTS_BASEDIR,
                               'html_{}.html'.format(export_uuid)), 'w') as f:
            f.write(html)
        addExportRef = models.EXPORTS(url='html_{}.html'.format(export_uuid),
                                      type='Network(html)',
                                      exported=datetime.utcnow(),
                                      count=count)
    else:
        return
    q.exports.append(addExportRef)
    db.session.commit()
    db.session.close()