'''
Created on Aug 4, 2013
@author: rajath
'''
import os
import getpass
import re
import lxml.html
from xml.sax import saxutils
import xml.etree.ElementTree as et
import logging
from pytrie import SortedStringTrie as trie
# TODO: write a function to map facebook user-ids with their usernames
# and display them while searching for a username
dir_path = ''
# Source file path
src_file_path = os.getcwd()
# Get username on Linux system
username = getpass.getuser()
log_path = '/home/' + username + '/.purple/logs/jabber'
buddy_list_file = '/home/' + username + '/.purple/blist.xml'
# Raw string so the regex escapes are passed through to re untouched
re_exp = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
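# Illustrative only (hypothetical input): the character classes above are
# broad enough to also swallow trailing log markup,
#   >>> re.findall(re_exp, 'see http://example.com/watch?v=x<br/>')
#   ['http://example.com/watch?v=x<br/>']
# which is why extract_url() below strips '<br/>' and '</a>' remnants.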
accounts = []
users = []
urls_dict = {}
fb_user_map = {}
# Map Facebook userIds to names, since the log file names are numeric ids.
# Do an initial scan of the userIds and store them in a dictionary.
# To be called after scanning the accounts.
def map_user_ids(accounts):
    # Store the location of the previous directory in a temporary buffer
    prev_dir = os.getcwd()
    # Parse the buddy list XML and store all alias -> userId mappings
    users_dict = {}
    tree = et.parse(buddy_list_file)
    root = tree.getroot()
    for account in accounts:
        users = root.findall(".//*[@account='" + account + "/']")
        for user in users:
            try:
                alias = user.find('alias').text.lower()
                name = user.find('name').text.lower()
                if alias not in users_dict:
                    users_dict[alias] = [name]
                else:
                    users_dict[alias].append(name)
            except AttributeError:
                # For some Facebook ids, usernames are not available;
                # handle those in logs
                pass
    # Switch back to the previous directory
    os.chdir(prev_dir)
    return users_dict
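# Shape of the returned mapping (hypothetical values):
#   {'john doe': ['100001234567890'], 'jane doe': ['100005555555555', '100006666666666']}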
# Extracts URLs from a ping (a single chat line)
def extract_url(line):
    # Identify every URL in the line
    refined_url = re.findall(re_exp, line)
    if len(refined_url) == 0:
        # TODO: raise an exception instead of silently returning an empty list
        pass
    # Strip stray log markup from the ends and remove duplicates
    return list(set([i.strip('<br/>').strip('</a>') for i in refined_url]))
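# Illustrative use (hypothetical log line, not part of the original script):
#   >>> extract_url('raj: see http://example.com/page1<br/>')
#   ['http://example.com/page1']
# Note that str.strip() removes a *set* of characters, so URLs that genuinely
# end in one of '<', 'b', 'r', '/', '>' (or '<', '/', 'a', '>') get over-trimmed.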
# Searches for users whose alias starts with the given search term
def search_users(accounts, search_term):
    global dir_path
    # Prefix ("like") search using PyTrie
    t = trie(users_dict)
    # Search through the matching aliases and collect ALL of their userIds
    aliases = t.keys(prefix=search_term)
    users = []
    names = []
    for alias in aliases:
        names = names + users_dict[alias]
    # Keep only the names that actually have a log directory in each account
    for account in accounts:
        dir_path = log_path + '/' + account
        os.chdir(dir_path)
        for name in names:
            if name in os.listdir(dir_path):
                users.append(account + '/' + name)
                break
    return users
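# The prefix lookup behaves roughly like this (hypothetical data):
#   >>> t = trie({'john doe': ['111'], 'johnny': ['222'], 'jane': ['333']})
#   >>> t.keys(prefix='john')
#   ['john doe', 'johnny']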
# Lists different accounts for which logs are available
def list_accounts(log_path):
    for account in os.listdir(log_path):
        accounts.append(account)
    return accounts
# Get all the links shared by each user
def get_links(users):
    # Search every account for the user: for example, the user may have
    # accounts on both Gmail and Facebook, and the loop below covers both.
    for i in users:
        log_files_path = log_path + '/' + i
        os.chdir(log_files_path)
        for filename in os.listdir(log_files_path):
            # Log file names start with the conversation date
            date = filename.split('.')[0]
            f = open(filename)
            lines = f.readlines()
            f.close()
            for line in lines:
                if re.findall(re_exp, line):
                    if not urls_dict.get(date):
                        url_list = []
                    else:
                        url_list = urls_dict[date]
                    urls_dict[date] = url_list + extract_url(line)
    return urls_dict
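# Resulting shape, keyed by the date prefix of each log file name
# (hypothetical values):
#   {'2013-08-04': ['http://example.com/page1', 'http://example.com/page2']}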
# Get the title of the URL to display it more meaningfully
def get_url_title(url):
    try:
        t = lxml.html.parse(url)
        # Kept inside the try: pages without a <title> raise AttributeError
        title = t.find(".//title").text
    except Exception:
        # Fall back to the raw URL if the page cannot be fetched or parsed
        return url
    return saxutils.escape(title)
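# saxutils.escape() protects the generated HTML from markup in page titles:
#   >>> saxutils.escape('Tom & Jerry <Official>')
#   'Tom &amp; Jerry &lt;Official&gt;'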
# If it's a YouTube URL, embed the video in the output HTML
def embed_youtube_video(yt_link):
    # Rewrite the watch URL into the embeddable player URL
    yt_link = yt_link.replace('watch?v=', 'embed/')
    yt_link = yt_link[yt_link.find('www'):]
    embed_text = ('<iframe width="560" height="315" src="http://' + yt_link +
                  '" frameborder="0" allowfullscreen></iframe>')
    return embed_text
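# Example of the rewrite (hypothetical video id):
#   'http://www.youtube.com/watch?v=abc123'
#   -> src="http://www.youtube.com/embed/abc123" inside the iframe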
# Replace placeholders in the HTML stub with the actual links and their dates
def insert_links_in_html(urls):
    output = ''
    for k, v in urls.iteritems():
        date = '<h2>' + k + '</h2>'
        links = ''
        for link in v:
            if 'youtube' in link:
                embed_link = embed_youtube_video(link)
                links += '<ul>' + get_url_title(link) + '<br/>' + embed_link + '</ul>'
            else:
                links += '<ul> <a href="' + link + '">' + get_url_title(link) + '</a></ul>'
        output += date + links
    return output
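# For one date with a single plain link, the fragment looks like this
# (hypothetical data, assuming get_url_title() returns 'Example Page'):
#   <h2>2013-08-04</h2><ul> <a href="http://example.com/page1">Example Page</a></ul>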
# Generate HTML from the stub, with links and dates filled in
def generate_html(urls):
    global users
    # The stub HTML lives in the data folder of the source directory;
    # temporarily 'cd' there while reading the stub and writing the output
    tmp = os.getcwd()
    os.chdir(src_file_path)
    html_stub = open('data/html_stub.html')
    generated_html = open('data/output_html.html', 'w')
    for line in html_stub:
        generated_html.write(line.replace('[INSERT_LINKS_HERE]', insert_links_in_html(urls)))
    html_stub.close()
    generated_html.close()
    os.chdir(tmp)
    return generated_html
accounts = list_accounts(log_path)
print 'Accounts found:', accounts
users_dict = map_user_ids(accounts)
search_term = raw_input('Search username: ')
users = search_users(accounts, search_term)
if len(users) != 0:
    print 'Users found:', users
else:
    print 'No logs found'
urls = get_links(users)
if len(urls) != 0:
    generate_html(urls)
    print 'Output file generated!'
else:
    print 'No links found!'