-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrap.py
91 lines (77 loc) · 2.95 KB
/
scrap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
from bs4 import BeautifulSoup
from getData import getData, getIds
from dataset import Dataset
import re
# Output:
# If any CPVs are found, returns a list of CPV codes (ints, without duplicates)
# Otherwise, returns None
# TEST: Finds 60 IT CPVs in Jan 2020
def extractCPVsFromSoup(soup):
    """Collect the CPV codes listed in a parsed announcement.

    Looks for every text node containing 'CPV' (e.g. the label
    '5. Códigos CPV:'), takes the first 'dd' element that follows the
    label's parent tag and reads the integer tokens from that element's
    first child.

    Args:
        soup: Parsed document exposing the BeautifulSoup search API used
            here (``find_all`` on the document; ``parent`` and
            ``findNext`` on the nodes it returns).

    Returns:
        A list of unique CPV codes as ints, in order of first appearance,
        or None when no code is found.
    """
    cpvs = []
    for cpv_label in soup.find_all(text=re.compile('CPV')):
        if cpv_label is None:
            continue
        # <dt>5. Códigos CPV:</dt> -> the next <dd> holds the codes.
        cpv_dd_tag = cpv_label.parent.findNext('dd')
        if cpv_dd_tag is None or not cpv_dd_tag.contents:
            # A 'CPV' mention without a following, non-empty <dd> carries
            # no codes; skip it instead of raising.
            continue
        # e.g. '72000000 (Servicios TI: consultoría, ...).'
        cpv_text = cpv_dd_tag.contents[0]
        if not isinstance(cpv_text, str):
            # The first child is a nested tag rather than plain text.
            continue
        # isdecimal() (unlike isdigit()) only accepts characters that int()
        # can parse, so odd tokens such as '²' cannot raise ValueError.
        cpvs.extend(int(s) for s in cpv_text.split() if s.isdecimal())
    if not cpvs:
        return None
    # dict.fromkeys() drops duplicates while keeping first-seen order.
    return list(dict.fromkeys(cpvs))
# TEST: Finds 62 IT CPVs in Jan 2020
def extractCPVsFromData(data):
    """Pull every 8-digit number out of the raw document text.

    Args:
        data: Document text to scan.

    Returns:
        A list with one int per maximal run of digits that is exactly
        eight characters long, in order of appearance (duplicates are
        kept). The list is empty when there is no such run.
    """
    found = []
    for digit_run in re.finditer(r'\d+', data):
        token = digit_run.group()
        # Shorter or longer digit runs are not treated as CPV codes.
        if len(token) != 8:
            continue
        found.append(int(token))
    return found
def extractInvestmentsFromData(data):
    """Find the euro amounts mentioned in a document's text.

    Matches amounts written with a decimal comma and exactly two decimals,
    optionally preceded by a run of digits and dots, and followed by
    ' euros' (e.g. '1.234.567,89 euros'), then rewrites each one in
    dot-decimal form. The resulting list is also printed.

    Args:
        data: Document text to scan.

    Returns:
        A list of strings such as '1234567.89 euros' (dots removed, the
        decimal comma turned into a dot, the ' euros' suffix kept), in
        order of appearance.
    """
    # '(?:[.\d]*\d)?' accepts exactly the same text as the previous
    # '(?:\.*\d+)*' (nothing, or a run of dots/digits ending in a digit)
    # but has no nested quantifier, so a long digit run that is not an
    # amount can no longer trigger exponential backtracking.
    amounts = re.findall(r'(?:[.\d]*\d)?,\d\d euros', data)
    costs = [amount.replace('.', '').replace(',', '.') for amount in amounts]
    print(costs)
    return costs
def containsITCPV(cpvs):
    """Tell whether any CPV code falls in the IT services range.

    Args:
        cpvs: Iterable of CPV codes as ints.

    Returns:
        True if at least one code c satisfies 72000000 <= c < 73000000,
        otherwise False.
    """
    return any(72000000 <= code < 73000000 for code in cpvs)
def main(ids_path='jan2020.csv', output_path='./data.csv'):
    """Build and export the dataset of IT-related licitations.

    Reads the document ids grouped by date with ``getIds``, loads each
    document with ``getData``, extracts its CPV codes and, when at least
    one of them is an IT code, adds the document to the dataset together
    with the investment amounts found in it. Finally prints a summary of
    the counts and exports the dataset as CSV.

    Args:
        ids_path: File handed to ``getIds``; defaults to the previously
            hard-coded 'jan2020.csv'.
        output_path: Destination handed to ``Dataset.exportAsCSV``;
            defaults to the previously hard-coded './data.csv'.
    """
    dataset = Dataset()
    counts = {
        'cpvs_not_found': 0,
        'it': 0,
        'not_it': 0,
    }

    idsDictionary = getIds(ids_path)
    for date in idsDictionary:
        for doc_id in idsDictionary[date]:
            data = getData(doc_id)
            # CPVs are taken from the raw text; extractCPVsFromSoup is the
            # alternative that works on a parsed document.
            cpvs = extractCPVsFromData(data)

            if not cpvs:
                # Covers both "nothing found" conventions (None and an
                # empty list). The document is reported and counted so the
                # run continues; previously the counter sat after a raise
                # and could never be incremented.
                print('CPVs NOT FOUND ' + str(doc_id))
                counts['cpvs_not_found'] += 1
                continue

            if not containsITCPV(cpvs):
                counts['not_it'] += 1
                continue

            investments = extractInvestmentsFromData(data)
            print('Adding document to dataset...')
            dataset.addEntry({
                'id': doc_id,
                'cpv': cpvs,
                'date': date,
                'investments': investments,
            })
            counts['it'] += 1

    print('Results:')
    print('IT Licitations Found: ' + str(counts['it']))
    print('Not-IT Licitations Found: ' + str(counts['not_it']))
    print('Licitations with unknown CPVs: ' + str(counts['cpvs_not_found']))
    print('Exporting dataset...')
    dataset.exportAsCSV(output_path)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()