/
fbread.py
75 lines (61 loc) · 1.78 KB
/
fbread.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/usr/bin/env python2
from bs4 import BeautifulSoup
import requests
import mechanize
from mechanize import Browser
import urllib2
import HTMLParser
import re
import HTMLParser
re.DOTALL
import os
import json
def mapPOS(facebookword):
    """Look up a word in the module-level ``dictionary`` table.

    The word is lower-cased and has the punctuation characters listed in
    ``punctuation`` removed before the lookup.

    Args:
        facebookword: The raw word to look up (byte string or unicode).

    Returns:
        The value stored in ``dictionary`` under the normalised word, or
        ``None`` when the normalised word is not present.
    """
    punctuation = '",!.?!@#$%^&*()_-:'
    # Filter character by character instead of ``translate(None, ...)``:
    # that two-argument form only exists on Python 2 byte strings and raises
    # TypeError when the input is unicode.
    word = ''.join(ch for ch in facebookword.lower() if ch not in punctuation)
    if word in dictionary:
        return dictionary[word]
    return None
def dictionary():
    """Parse ``dictionary.json`` (relative to the working directory).

    Returns:
        Whatever object the JSON document decodes to.
    """
    with open('dictionary.json', 'r') as handle:
        return json.load(handle)
# Load the lookup table once; mapPOS() reads this module-level name.
# NOTE: this rebinds ``dictionary`` from the loader function to the loaded
# data, so the loader cannot be called a second time.
dictionary = dictionary()

# Credentials are taken from the environment so they are never stored in the
# source file. A KeyError is raised here, before any network traffic, if
# either variable is unset.
_fb_email = os.environ['FACEBOOK_EMAIL']
_fb_password = os.environ['FACEBOOK_PASSWORD']

# Log in through the Facebook login form and keep the HTML of the response.
br = Browser()
br.set_handle_robots(False)
url = 'https://www.facebook.com/login/'
br.open(url)
# NOTE(review): assumes the first form on the page is the login form.
br.form = list(br.forms())[0]
br.form['email'] = _fb_email
br.form['pass'] = _fb_password
req = br.submit()
html = req.read()

# Capture the text that directly follows each "<p>" tag. Inside the character
# class, "!-)" is a range (the characters from "!" through ")"), not three
# literals. The pattern is applied to the in-memory HTML; no scratch file is
# written, so there is nothing to clean up afterwards.
matches = re.findall(r"<p>([a-zA-Z0-9 !-)(?.,;:'\"_+= -]*)", html)

with open('facebook_dictionary.json', 'r') as fb:
    fb_dictionary = json.load(fb)

_strip_chars = '",!.?!@#$%^&*()_-:<>'
_html_parser = HTMLParser.HTMLParser()

# Both output files are opened in append mode and are closed by the ``with``
# blocks even if an error occurs part-way through.
with open('unknown_facebook_words', 'a') as unknown:
    with open('facebookwords.txt', 'a') as f:
        for match in matches:
            stripped_text = _html_parser.unescape(match)
            if not stripped_text:
                continue
            # ' xYx ' separates consecutive captured paragraphs in the file.
            f.write(stripped_text + ' xYx ')
            words = stripped_text.split()
            for word in words:
                lowered = str(word).lower()
                word = ''.join(ch for ch in lowered if ch not in _strip_chars)
                POS = mapPOS(word)
                if not POS:
                    # No entry (or an empty one) for this word: log it.
                    unknown.write(word + ', ')
                    continue
                for char in POS:
                    # setdefault creates the list on first sight of a key AND
                    # still records the word; previously the first word for a
                    # new key was silently dropped.
                    fb_dictionary.setdefault(char, []).append(word)

with open('facebook_dictionary.json', 'w') as fb:
    json.dump(fb_dictionary, fb)