/
crawler.py
88 lines (71 loc) · 2.67 KB
/
crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import xml.etree.ElementTree as ET
import time
import csv
import re
import util
import json
from review_score_crawler import crawl_review_score
# Remove double spaces, whitespaces and change everything to uppercase from profile data
def clean_ski(l):
    """Return a list of uppercase token lists, one per string in *l*.

    Each string is uppercased and split on whitespace.  ``str.split()``
    with no argument already discards leading/trailing whitespace and
    collapses internal runs of spaces, so no separate strip/regex pass is
    needed.  (The original loop's ``r.strip()`` and ``re.sub`` results
    were discarded — dead code removed.)
    """
    return [s.upper().split() for s in l]
# Initialize chrome webdriver
def get_chrome_driver(config):
    """Build and return a headless, incognito Chrome webdriver.

    The chromedriver binary location is read from
    ``config['chrome-webdriver']['windows']``.
    """
    opts = Options()
    for flag in ("--headless", "--start-maximized", "--incognito"):
        opts.add_argument(flag)
    binary_path = config['chrome-webdriver']['windows']
    return webdriver.Chrome(options=opts, executable_path=binary_path)
def write_to_csv(lines):
    """Write *lines* (an iterable of row sequences) to results.csv,
    overwriting any existing file."""
    with open('results.csv', 'w', newline='', encoding='utf-8') as out:
        csv.writer(out).writerows(lines)
def write_to_json(data):
    """Serialize *data* as JSON into results.json, overwriting any
    existing file."""
    with open('results.json', 'w', encoding='utf-8') as out:
        json.dump(data, out)
# Crawls the friends
def crawl_friends(profile, driver):
    """Return the ``data-steamid`` attribute of every friend block on
    the profile's friends page."""
    driver.get(f'https://steamcommunity.com/id/{profile}/friends/')
    blocks = driver.find_elements_by_class_name("friend_block_v2")
    return [block.get_attribute("data-steamid") for block in blocks]
def get_steam_id(element, driver):
    """Resolve a profile link element to its 64-bit Steam ID string.

    *element*.text is expected to be a ``steamcommunity.com/id/<name>/...``
    URL; the vanity name (path segment 4) is fetched with ``?xml=1`` and
    the text of the first ``<steamID64>`` tag in the response is returned.

    Raises ValueError when the XML contains no ``<steamID64>`` element.
    (The original fell through the loop and raised UnboundLocalError in
    that case; it also kept the *last* ID found despite its comment
    claiming the first — fixed to return the first.)
    """
    profile_link = element.text.split('/')[4]
    driver.get(f'https://steamcommunity.com/id/{profile_link}?xml=1')
    root = ET.fromstring(driver.page_source)
    for node in root.iter('steamID64'):
        return node.text  # The first ID encountered
    raise ValueError(f"no steamID64 found for profile {profile_link!r}")
def main():
    """Crawl one reviewer's Steam friends list and dump it to results.json.

    Flow: open the Steam community reviews page, click one review card,
    find the author's profile link, resolve their 64-bit ID, then crawl
    their friends page and write {steamID64: [friend ids]} as JSON.
    Returns 0 on completion.
    """
    config = util.load_config()
    driver = get_chrome_driver(config)
    data = {}
    # Open reviews and pick first review to start crawling
    driver.get("https://steamcommunity.com/?subsection=reviews")
    print(driver.title)
    # Locate submit button, click to get all results
    driver.implicitly_wait(15)
    reviewCards = driver.find_elements_by_class_name("apphub_Card")
    reviewCards[1].click() # Mod as needed, some profiles are private
    # First anchor on the review page that points at a /id/ profile URL.
    element = driver.find_element_by_xpath("//a[contains(@href, 'steamcommunity.com/id')]")
    # Vanity name is path segment 4 of the profile URL.
    profile_link = element.text.split('/')[4]
    steamID64 = get_steam_id(element, driver)
    # NOTE(review): message says csv but the result is written as JSON below.
    print("Writing to csv...")
    data[steamID64] = crawl_friends(profile_link, driver)
    write_to_json(data)
    time.sleep(10)
    # Close everything
    # NOTE(review): driver.quit() is skipped if anything above raises —
    # consider try/finally.
    driver.quit()
    return 0
# Script entry point: run the crawler only when executed directly.
if __name__ == '__main__':
    main()