-
Notifications
You must be signed in to change notification settings - Fork 0
/
aa.py
executable file
·127 lines (104 loc) · 3.31 KB
/
aa.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#from BeautifulSoup import BeautifulSoup, SoupStrainer
import requests
from xgoogle.search import GoogleSearch, SearchError
from xgoogle.googlesets import GoogleSets, LARGE_SET, SMALL_SET
import pickle
import re
import datetime
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from table_def import Letters
import urllib
#import scraperwiki
import pdfminer
# SEC 13F securities list for Q4 2013 -- the PDF this script's
# convert_pdf_to_txt() was being tested against (see commented tests below).
pdf_url = 'http://www.sec.gov/divisions/investment/13f/13flist2013q4.pdf'
# Candidate Google queries for locating shareholder letters.
searchTerms = [ 'shareholder letter', 'letter to shareholders' ]
def getUrls2(page_num):
    """Print the result URLs from one page of a 'shareholder letter' search.

    Fetches page *page_num* of a Google search (50 results per page) via
    xgoogle and prints each hit's URL, UTF-8 encoded.
    """
    search = GoogleSearch('shareholder letter')
    search.results_per_page = 50
    search.page = page_num
    for hit in search.get_results():
        print(hit.url.encode("utf8"))
def getUrls(searchTerm):
    """Google-search *searchTerm* and return the list of result URLs.

    The links are also pickled to 'output.txt' for later reuse.
    Returns an empty list (instead of the previous None) if the search
    fails with a SearchError.
    """
    links = []
    try:
        gs = GoogleSearch(searchTerm)
        gs.results_per_page = 50
        results = gs.get_results()
        for res in results:
            links.append(res.url.encode("utf8"))
    except SearchError as e:
        # Original leaked the open file handle on this path.
        print("Search failed: %s" % e)
        return links
    # Persist only after a successful search; the file is closed even if
    # pickling raises.
    with open('output.txt', 'w') as f:
        pickle.dump(links, f)
    return links
def checkWords(links, phrase='richest men'):
    """Fetch each URL in *links* and keep the pages that match *phrase*.

    A page matches when at least one of its sentences contains every
    word of *phrase*. Returns the list of matching page texts.

    Generalized: the search phrase is now a parameter (defaults to the
    previously hard-coded 'richest men'). Also fixed: the original
    appended the full page text once per matching sentence, producing
    duplicate copies of the same page.
    """
    texts = []
    words = phrase.split(' ')
    for link in links:
        r = requests.get(link)
        txt = r.text
        sentences = re.findall(r"([^.]*\.)", txt)
        for sentence in sentences:
            if all(word in sentence for word in words):
                texts.append(txt)
                break  # one copy per page is enough
    return texts
def addToDb(texts):
    """Wrap each raw letter text in a Letters row, ready for insertion.

    NOTE(review): actual inserts are still disabled (session.add/commit
    remain commented out, as in the original) -- this currently only
    builds and logs the list of rows.
    """
    engine = create_engine('sqlite:///shareholder_letters.db', echo=True)
    Session = sessionmaker(bind=engine)
    session = Session()
    list_to_add = []
    for text in texts:
        letter = Letters(text)
        print(letter)
        # Bug fix: the original appended Letters(letter), wrapping an
        # already-constructed row in another Letters(...) call.
        list_to_add.append(letter)
    print("add to list")
    print(list_to_add)
    # session.add(new_letter)
    # session.commit()
def getSearchResultsByPageNum(search_term, start_num_end_num):
    """Fetch search results for *search_term* over a range of pages.

    *start_num_end_num* is a (start, end) pair of page numbers; pages
    [start, end) are fetched. Returns the accumulated list of URLs.

    The original body referenced undefined names (start_num, end_num,
    start) and passed a second argument to getUrls, which takes only the
    search term -- it raised NameError/TypeError on every call.
    NOTE(review): getUrls has no page parameter, so each iteration
    repeats the same query; per-page selection presumably belongs in
    getUrls2 -- TODO confirm intent.
    """
    start_num, end_num = start_num_end_num
    links = []
    num = start_num
    while num < end_num:
        links.extend(getUrls(search_term))
        num = num + 1
    return links
#f = open('output.txt', 'r')
#links = pickle.load(f)
#print links
#texts = checkWords( links )
#addToDb( texts )
#print "HELLO"
#path = '13flist2013q4.pdf'
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
def convert_pdf_to_txt(path):
    """Download the PDF at *path* (a URL) and return its extracted text.

    Retrieves the file with urllib.urlretrieve, then runs pdfminer over
    every page, collecting output into an in-memory buffer.

    Fixes vs. original: the result variable no longer shadows the
    builtin `str`; the file handle, converter device and buffer are
    released even if extraction raises; leftover debug prints removed.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    # urlretrieve returns (local_filename, headers); open the local copy.
    local_path = urllib.urlretrieve(path)[0]
    fp = open(local_path, 'rb')
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        password = ""
        maxpages = 0
        caching = True
        pagenos = set()
        for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                      password=password, caching=caching,
                                      check_extractable=True):
            interpreter.process_page(page)
        text = retstr.getvalue()
    finally:
        fp.close()
        device.close()
        retstr.close()
    return text
#test_url = 'http://www.thirdavenuecapitalplc.com/ucits/shareholder-letters.asp'
#test_remote_pdf = 'http://www.thirdavenuecapitalplc.com/ucits/docs/shareholderletters/Q4%202013%20UCITS%20Letters.pdf'
#convert_pdf_to_txt(test_remote_pdf)
if __name__ == '__main__':
    # Guard the ad-hoc demo call so importing this module no longer
    # fires a live Google search as a side effect.
    getUrls2(11)